In [ ]:
Copied!
# ruff: noqa: E402
# ruff: noqa: E402
Perla Papersbot Analysis
This notebook retrieves data from the Perla Papersbot and plots the filtering steps.
In [1]:
Copied!
import io
import time
import zipfile
import pandas as pd
import plotly.graph_objects as go
import requests
from plotly_theme import register_template, set_defaults # type: ignore
# Register and set default Plotly theme for consistent styling
register_template()
set_defaults()
import io
import time
import zipfile
import pandas as pd
import plotly.graph_objects as go
import requests
from plotly_theme import register_template, set_defaults # type: ignore
# Register and set default Plotly theme for consistent styling
register_template()
set_defaults()
Overview¶
- Data Retrieval: Downloads Perla Papersbot run log files stored in NOMAD.
- Data Processing: Loads and processes the downloaded CSV files to extract statistics related to paper matching, abstract availability, and open-access status.
- Visualisation: Generates a Sankey diagram to visually represent the multi-stage filtering pipeline for identifying new perovskite solar cell papers, showing the flow of papers through different stages of filtering.
- Summary: Provides a textual summary of the filtering process, including the total number of papers parsed, initial matches, papers filtered out, and the final count of relevant and open-access papers.
1. Data Retrieval¶
In [2]:
Copied!
local_dir = 'paperbot_runs'  # Local directory to save the downloaded files


def download_archive(upload_id: str = 'TZL3dKwGT8O5Rjr5_13g4g', dest_path=local_dir):
    """Fetch the zipped ``runs`` folder of a NOMAD upload and unpack it.

    Parameters
    ----------
    upload_id : str
        NOMAD upload identifier (defaults to the Perla Papersbot upload).
    dest_path : str
        Local directory the archive is extracted into.

    Raises
    ------
    requests.HTTPError
        If the NOMAD API responds with an error status.
    """
    url = (
        f'https://nomad-lab.eu/prod/v1/api/v1/uploads/{upload_id}'
        '/raw/runs?offset=0&length=-1&decompress=false&ignore_mime_type=false&compress=true'
    )
    response = requests.get(
        url,
        headers={
            'Accept': 'application/octet-stream',
        },
        timeout=120,
    )
    response.raise_for_status()
    # The endpoint returns a zip archive; unpack it straight from memory,
    # no temporary file needed.
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(dest_path)
    print(f'Downloaded and extracted archive to {dest_path}')
local_dir = 'paperbot_runs'  # Local directory to save the downloaded files


# Function to download files from NOMAD upload
def download_archive(upload_id: str = 'TZL3dKwGT8O5Rjr5_13g4g', dest_path=local_dir):
    """Download the ``runs`` folder of a NOMAD upload as a zip and extract it.

    Parameters
    ----------
    upload_id : str
        NOMAD upload identifier to fetch (defaults to the Perla Papersbot upload).
    dest_path : str
        Local directory the archive contents are extracted into.

    Raises
    ------
    requests.HTTPError
        If the NOMAD API responds with an error status (via ``raise_for_status``).
    """
    res = requests.get(
        f'https://nomad-lab.eu/prod/v1/api/v1/uploads/{upload_id}/raw/runs?offset=0&length=-1&decompress=false&ignore_mime_type=false&compress=true',
        headers={
            'Accept': 'application/octet-stream',
        },
        timeout=120,
    )
    res.raise_for_status()
    # Keep the downloaded zip in memory; no temporary file is written.
    zip_in_memory = io.BytesIO(res.content)
    with zipfile.ZipFile(zip_in_memory) as zip_ref:
        zip_ref.extractall(dest_path)
    print(f'Downloaded and extracted archive to {dest_path}')
In [10]:
Copied!
download_archive()
download_archive()
Downloaded and extracted archive to paperbot_runs
2. Data Processing¶
In [4]:
Copied!
def get_stats() -> dict:
    """Compute filtering-pipeline statistics from the downloaded run logs.

    Reads ``post_proc.csv`` (entries that resolved to a DOI, post-processed)
    and ``entry_stats.csv`` (all parsed RSS entries) from ``local_dir`` and
    returns a flat dict of integer counts for each filtering stage, plus the
    covered time period (``period`` in seconds, ``start``/``end`` as
    ``dd-mm-YYYY`` strings).
    """
    # Load post-processed data and full entry statistics; NaN cells from
    # read_csv are normalised to '' so string comparisons below work.
    post_proc_df = pd.read_csv(f'{local_dir}/post_proc.csv').replace({float('nan'): ''})
    # An empty pdf_available cell means "not available" -> coerce to False.
    post_proc_df['pdf_available'] = post_proc_df['pdf_available'].apply(
        lambda x: False if x == '' else x
    )
    full_df = pd.read_csv(f'{local_dir}/entry_stats.csv').replace({float('nan'): ''})
    # Filter data based on different matching criteria:
    # strict = more than 2 strict-regex hits; relaxed = keyword match only.
    full_rss = full_df[full_df['match'] == 1]
    full_strict_rss = full_df[full_df['strict_regex'] > 2]
    strict_rss_with_doi = post_proc_df[post_proc_df['strict_regex'] > 2]
    full_relaxed_rss = full_df[(full_df['match'] == 1) & (full_df['strict_regex'] <= 2)]
    relaxed_rss_with_doi = post_proc_df[
        (post_proc_df['match'] == 1) & (post_proc_df['strict_regex'] <= 2)
    ]
    # Store dataframes in a dictionary for easier access:
    # key -> (all RSS matches, matches that resolved to a DOI)
    dfs = {
        'full': (full_rss, post_proc_df),
        'strict_rss': (full_strict_rss, strict_rss_with_doi),
        'relaxed_rss': (full_relaxed_rss, relaxed_rss_with_doi),
    }
    # Initialize statistics dictionary; match_df = papers that passed both
    # the abstract match and the doi_good_to_go filter.
    match_df = post_proc_df[
        (post_proc_df['abstract_match']) & (post_proc_df['doi_good_to_go'])
    ]
    stats = {
        'total': len(full_df),
        'oa': len(match_df[match_df['pdf_available']]),
        'non_oa': len(
            match_df[match_df['pdf_url'] == '']
        ),  # Non-OA with abstract match
        'oa_no_info': len(match_df[match_df['pdf_url'].apply(lambda x: 'Error' in x)]),
    }  # OA with errors in URL
    # Calculate statistics for each category (full, strict_rss, relaxed_rss)
    for key, (fdf, doi_df) in dfs.items():
        abs_df = doi_df[doi_df['abstract_found']]
        stats[f'{key}_match'] = len(fdf)
        stats[f'{key}_match_with_doi'] = len(doi_df)
        stats[f'{key}_match_missing_doi'] = len(fdf) - len(doi_df)
        stats[f'{key}_abstracts_found'] = len(abs_df)
        stats[f'{key}_missing_abstracts'] = len(doi_df) - len(abs_df)
        stats[f'{key}_strict_matches_with_abstract_found'] = len(
            abs_df[abs_df['abstract_match']]
        )
        stats[f'{key}_strict_matches_without_abstract'] = len(
            doi_df[doi_df['abstract_match']]
        ) - len(abs_df[abs_df['abstract_match']])
        stats[f'{key}_total_strict_matches'] = len(doi_df[doi_df['abstract_match']])
        stats[f'{key}_doi_good_matches_with_abstract_found'] = len(
            abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])]
        )
        stats[f'{key}_doi_good_matches_without_abstract'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        ) - len(abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])])
        stats[f'{key}_total_doi_good_matches'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        )
    # Ensure all stats are integers (numpy ints -> plain int)
    stats = {k: int(v) for k, v in stats.items()}
    # Get the time range of parsed data.
    # NOTE(review): parsed_time is assumed to be a UNIX timestamp in seconds
    # (it is fed to time.gmtime below) — confirm against the papersbot output.
    parsed_times = full_df['parsed_time'].values
    start = min(parsed_times)
    end = max(parsed_times)
    stats['period'] = end - start  # added after the int() pass above
    stats['start'] = time.strftime('%d-%m-%Y', time.gmtime(start))
    stats['end'] = time.strftime('%d-%m-%Y', time.gmtime(end))
    return stats
def get_stats() -> dict:
    """Compute filtering-pipeline statistics from the downloaded run logs.

    Reads ``post_proc.csv`` (entries that resolved to a DOI, post-processed)
    and ``entry_stats.csv`` (all parsed RSS entries) from ``local_dir`` and
    returns a flat dict of integer counts for each filtering stage, plus the
    covered time period (``period`` in seconds, ``start``/``end`` as
    ``dd-mm-YYYY`` strings).
    """
    # Load post-processed data and full entry statistics; NaN cells from
    # read_csv are normalised to '' so string comparisons below work.
    post_proc_df = pd.read_csv(f'{local_dir}/post_proc.csv').replace({float('nan'): ''})
    # An empty pdf_available cell means "not available" -> coerce to False.
    post_proc_df['pdf_available'] = post_proc_df['pdf_available'].apply(
        lambda x: False if x == '' else x
    )
    full_df = pd.read_csv(f'{local_dir}/entry_stats.csv').replace({float('nan'): ''})
    # Filter data based on different matching criteria:
    # strict = more than 2 strict-regex hits; relaxed = keyword match only.
    full_rss = full_df[full_df['match'] == 1]
    full_strict_rss = full_df[full_df['strict_regex'] > 2]
    strict_rss_with_doi = post_proc_df[post_proc_df['strict_regex'] > 2]
    full_relaxed_rss = full_df[(full_df['match'] == 1) & (full_df['strict_regex'] <= 2)]
    relaxed_rss_with_doi = post_proc_df[
        (post_proc_df['match'] == 1) & (post_proc_df['strict_regex'] <= 2)
    ]
    # Store dataframes in a dictionary for easier access:
    # key -> (all RSS matches, matches that resolved to a DOI)
    dfs = {
        'full': (full_rss, post_proc_df),
        'strict_rss': (full_strict_rss, strict_rss_with_doi),
        'relaxed_rss': (full_relaxed_rss, relaxed_rss_with_doi),
    }
    # Initialize statistics dictionary; match_df = papers that passed both
    # the abstract match and the doi_good_to_go filter.
    match_df = post_proc_df[
        (post_proc_df['abstract_match']) & (post_proc_df['doi_good_to_go'])
    ]
    stats = {
        'total': len(full_df),
        'oa': len(match_df[match_df['pdf_available']]),
        'non_oa': len(
            match_df[match_df['pdf_url'] == '']
        ),  # Non-OA with abstract match
        'oa_no_info': len(match_df[match_df['pdf_url'].apply(lambda x: 'Error' in x)]),
    }  # OA with errors in URL
    # Calculate statistics for each category (full, strict_rss, relaxed_rss)
    for key, (fdf, doi_df) in dfs.items():
        abs_df = doi_df[doi_df['abstract_found']]
        stats[f'{key}_match'] = len(fdf)
        stats[f'{key}_match_with_doi'] = len(doi_df)
        stats[f'{key}_match_missing_doi'] = len(fdf) - len(doi_df)
        stats[f'{key}_abstracts_found'] = len(abs_df)
        stats[f'{key}_missing_abstracts'] = len(doi_df) - len(abs_df)
        stats[f'{key}_strict_matches_with_abstract_found'] = len(
            abs_df[abs_df['abstract_match']]
        )
        stats[f'{key}_strict_matches_without_abstract'] = len(
            doi_df[doi_df['abstract_match']]
        ) - len(abs_df[abs_df['abstract_match']])
        stats[f'{key}_total_strict_matches'] = len(doi_df[doi_df['abstract_match']])
        stats[f'{key}_doi_good_matches_with_abstract_found'] = len(
            abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])]
        )
        stats[f'{key}_doi_good_matches_without_abstract'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        ) - len(abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])])
        stats[f'{key}_total_doi_good_matches'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        )
    # Ensure all stats are integers (numpy ints -> plain int)
    stats = {k: int(v) for k, v in stats.items()}
    # Get the time range of parsed data.
    # NOTE(review): parsed_time is assumed to be a UNIX timestamp in seconds
    # (it is fed to time.gmtime below) — confirm against the papersbot output.
    parsed_times = full_df['parsed_time'].values
    start = min(parsed_times)
    end = max(parsed_times)
    stats['period'] = end - start  # added after the int() pass above
    stats['start'] = time.strftime('%d-%m-%Y', time.gmtime(start))
    stats['end'] = time.strftime('%d-%m-%Y', time.gmtime(end))
    return stats
3. Visualisation¶
In [20]:
Copied!
def complex_sankey():
    """Build and display a Sankey diagram of the paper-filtering pipeline.

    Returns
    -------
    tuple
        ``(fig, stats)`` — the Plotly figure and the statistics dict from
        :func:`get_stats` used to size the links.
    """
    stats = get_stats()

    # Node labels; <br> renders as a line break inside the Sankey nodes.
    labels = [
        'RSS Summary <br>Keyword Match',
        'Un-Resolvable Reference',
        'Abstract Found',
        'Abstract Missing',
        'Filtered Papers',
        'Open-Access',
        'Non Open-Access',
        'Failed Retrieval',
    ]
    # Link i flows from node sources[i] to node targets[i] with width values[i].
    sources = [0, 0, 0, 2, 3, 4, 4, 4]
    targets = [1, 2, 3, 4, 4, 5, 6, 7]
    values = [
        stats['full_match_missing_doi'],  # Un-Resolvable Reference
        stats['full_abstracts_found'],  # Abstract Found
        stats['full_missing_abstracts'],  # Abstract Missing
        stats['full_doi_good_matches_with_abstract_found'],  # Keyword Match
        stats['full_doi_good_matches_without_abstract'],  # Keyword Match
        stats['oa'],  # Open-Access
        stats['non_oa'],  # Non Open-Access
        stats['oa_no_info'],  # Failed Retrieval
    ]

    # Total flow through each node: the sum of its incoming links, or of its
    # outgoing links for pure source nodes (nodes never used as a target).
    node_values = [
        sum(
            value
            for endpoint, value in zip(targets if i in targets else sources, values)
            if endpoint == i
        )
        for i in range(len(labels))
    ]
    # Append the per-node totals to the displayed labels.
    labels = [f'<b>{i}<br>{v}</b>' for i, v in zip(labels, node_values)]

    sankey = go.Sankey(
        valueformat='.0f',
        arrangement='snap',
        node=dict(
            pad=16,
            thickness=10,
            line=dict(color='black', width=0.5),
            label=labels,
            align='left',
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color='rgba(0,0,255,0.2)',
        ),
    )
    fig = go.Figure(data=[sankey])
    # Title includes the total paper count and the covered date range.
    fig.update_layout(
        title_text=f"<b>{stats['total']} papers parsed from {stats['start']} to {stats['end']}</b>",
        font_size=12,
        width=800,
    )
    fig.show(renderer='svg')
    return fig, stats
def complex_sankey():
    """Render a Sankey diagram of the paper-filtering pipeline.

    NOTE(review): this exported copy was garbled — the newlines were collapsed
    and the HTML ``<b>``/``<br>`` tags inside the label and title f-strings
    were stripped by the HTML export. Reconstructed to match the clean
    definition that precedes it in this notebook.

    Returns
    -------
    tuple
        ``(fig, stats)`` — the Plotly figure and the dict from get_stats().
    """
    # Get statistics from the data
    stats = get_stats()
    # Define labels for the Sankey diagram nodes
    labels = [
        'RSS Summary <br>Keyword Match',
        'Un-Resolvable Reference',
        'Abstract Found',
        'Abstract Missing',
        'Filtered Papers',
        'Open-Access',
        'Non Open-Access',
        'Failed Retrieval',
    ]
    # Define source and target nodes for the links
    sources = [0, 0, 0, 2, 3, 4, 4, 4]
    targets = [1, 2, 3, 4, 4, 5, 6, 7]
    # Define the values (thickness) for the links based on calculated statistics
    values = [
        stats['full_match_missing_doi'],  # Un-Resolvable Reference
        stats['full_abstracts_found'],  # Abstract Found
        stats['full_missing_abstracts'],  # Abstract Missing
        stats['full_doi_good_matches_with_abstract_found'],  # Keyword Match
        stats['full_doi_good_matches_without_abstract'],  # Keyword Match
        stats['oa'],  # Open-Access
        stats['non_oa'],  # Non Open-Access
        stats['oa_no_info'],  # Failed Retrieval
    ]
    # Calculate the total value for each node to display in the label:
    # incoming links, or outgoing links for nodes never used as a target.
    node_values = []
    for i in range(len(labels)):
        v = 0
        node_list = targets
        if i not in targets:
            node_list = sources
        for j in range(len(node_list)):
            v += values[j] if node_list[j] == i else 0
        node_values.append(v)
    # Format labels to include node values
    labels = [f'<b>{i}<br>{v}</b>' for i, v in zip(labels, node_values)]
    # Create the Sankey diagram figure
    fig = go.Figure(
        data=[
            go.Sankey(
                valueformat='.0f',
                arrangement='snap',
                node=dict(
                    pad=16,
                    thickness=10,
                    line=dict(color='black', width=0.5),
                    label=labels,
                    align='left',
                ),
                link=dict(
                    source=sources,
                    target=targets,
                    value=values,
                    color='rgba(0,0,255,0.2)',
                ),
            )
        ]
    )
    # Set a title for the plot including total papers and date range
    fig.update_layout(
        title_text=f"<b>{stats['total']} papers parsed from {stats['start']} to {stats['end']}</b>",
        font_size=12,
        width=800,
    )
    # Display the plot
    fig.show(renderer='svg')
    return fig, stats
4. Summary¶
In [7]:
Copied!
# Fix 1: `stats` was never defined at module level — it leaked from hidden
# kernel state (the execution counts show the notebook ran out of order), so
# this cell failed under Restart & Run All. Recompute it explicitly here.
# Fix 2: the line continuation after the unresolvable-DOI clause dropped the
# space before "and", producing "),and" in the rendered summary.
stats = get_stats()

f"The multi-stage filtering pipeline for identifying new perovskite solar cell papers. \
Over a {int(stats['period'] / 86400)}-day period ({stats['start']} to {stats['end']}), {stats['total']} papers were parsed from Journal RSS feeds. \
An initial match against RSS summaries identified {stats['full_match']} candidates. Subsequent steps remove papers with unresolvable DOIs (n = {stats['full_match_missing_doi']}), \
and failing a secondary strict match (n = {stats['full_match'] - stats['full_total_strict_matches'] - stats['full_match_missing_doi']}). \
Further filtering is done to exclude theoretical, computational and review works (n = {stats['full_total_strict_matches'] - stats['full_total_doi_good_matches']}), yielding a final set of {stats['full_total_doi_good_matches']} relevant papers of which {stats['oa']} were open-access papers."
# NOTE(review): `stats` is not assigned anywhere at module level in this
# notebook — it presumably leaked from an un-exported cell (e.g.
# `fig, stats = complex_sankey()`) or out-of-order execution; confirm and add
# that call, otherwise this cell raises NameError under Restart & Run All.
# Also note the f-string's `\` continuation after "...missing_doi']}),"
# joins the lines with no space, yielding "),and" in the rendered summary.
f"The multi-stage filtering pipeline for identifying new perovskite solar cell papers. \
Over a {int(stats['period'] / 86400)}-day period ({stats['start']} to {stats['end']}), {stats['total']} papers were parsed from Journal RSS feeds. \
An initial match against RSS summaries identified {stats['full_match']} candidates. Subsequent steps remove papers with unresolvable DOIs (n = {stats['full_match_missing_doi']}),\
and failing a secondary strict match (n = {stats['full_match'] - stats['full_total_strict_matches'] - stats['full_match_missing_doi']}). \
Further filtering is done to exclude theoretical, computational and review works (n = {stats['full_total_strict_matches'] - stats['full_total_doi_good_matches']}), yielding a final set of {stats['full_total_doi_good_matches']} relevant papers of which {stats['oa']} were open-access papers."
Out[7]:
'The multi-stage filtering pipeline for identifying new perovskite solar cell papers. Over a 153-day period (24-09-2025 to 25-02-2026), 124519 papers were parsed from Journal RSS feeds. An initial match against RSS summaries identified 2615 candidates. Subsequent steps remove papers with unresolvable DOIs (n = 146),and failing a secondary strict match (n = 1750). Further filtering is done to exclude theoretical, computational and review works (n = 240), yielding a final set of 479 relevant papers of which 100 were open-access papers.'
In [ ]:
Copied!