In [ ]:
# ruff: noqa: E402
Perovskite Papersbot Analysis
This notebook retrieves data from the Perovskite Papersbot and plots the filtering steps.
In [1]:
import time
import pandas as pd
import plotly.graph_objects as go
from huggingface_hub import HfApi, snapshot_download
from plotly_theme import register_template, set_defaults # type: ignore
# Register and set default Plotly theme for consistent styling
register_template()
set_defaults()
Overview¶
- Data Retrieval: Downloads the Perovskite Papersbot run log files stored in the Hugging Face repo pilar12/perovskite-papersbot.
- Data Processing: Loads and processes the downloaded CSV files to extract statistics on paper matching, abstract availability, and open-access status.
- Visualisation: Generates a Sankey diagram of the multi-stage filtering pipeline for identifying new perovskite solar cell papers, showing how papers flow through the successive filtering stages.
- Summary: Provides a textual summary of the filtering process, including the total number of papers parsed, the initial matches, the papers filtered out, and the final count of relevant and open-access papers.
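As a quick orientation for the steps listed above, the sketch below (an addition to this notebook, not part of the original pipeline) shows how the helper functions defined in the following cells fit together end to end:

# Preview of the notebook's flow; the functions are defined in the cells below.
download_files()               # 1. fetch the Papersbot run logs from Hugging Face
fig, stats = complex_sankey()  # 2.-3. compute statistics (via get_stats) and draw the Sankey diagram
print(stats['total'], 'papers parsed between', stats['start'], 'and', stats['end'])  # 4. summary numbers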
1. Data Retrieval¶
In [2]:
# Initialize Hugging Face API token and repository ID
repo_id = 'pilar12/perovskite-papersbot'
api = HfApi()
local_dir = 'paperbot_runs'
# Function to download files from the Hugging Face repository
def download_files():
    snapshot_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,  # Local directory to save the downloaded files
        repo_type='dataset',
        revision='comb_regex',
        force_download=True,
    )
    return snapshot_path
In [3]:
download_files()
Fetching 14 files: 100%|██████████| 14/14 [00:03<00:00, 3.89it/s]
Out[3]:
'/Users/pepemarquez/git/Pepe-Marquez/nomad-distro-dev/packages/nomad-perovskite-solar-cells-database/src/perovskite_solar_cell_database/example_uploads/perla_notebooks/paperbot_runs'
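As an optional sanity check (an addition, not part of the original notebook), the snippet below lists the CSV run logs that snapshot_download placed in the local directory; the processing step expects at least post_proc.csv and entry_stats.csv to be present.

from pathlib import Path

# List the downloaded CSV run logs and their sizes.
for csv_path in sorted(Path(local_dir).glob('*.csv')):
    print(f'{csv_path.name:25s} {csv_path.stat().st_size / 1024:.1f} kB')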
2. Data Processing¶
In [4]:
def get_stats():
    # Load post-processed data and full entry statistics
    post_proc_df = pd.read_csv(f'{local_dir}/post_proc.csv').replace({float('nan'): ''})
    post_proc_df['pdf_available'] = post_proc_df['pdf_available'].apply(
        lambda x: False if x == '' else x
    )
    full_df = pd.read_csv(f'{local_dir}/entry_stats.csv').replace({float('nan'): ''})
    # Filter data based on different matching criteria
    full_rss = full_df[full_df['match'] == 1]
    full_strict_rss = full_df[full_df['strict_regex'] > 2]
    strict_rss_with_doi = post_proc_df[post_proc_df['strict_regex'] > 2]
    full_relaxed_rss = full_df[(full_df['match'] == 1) & (full_df['strict_regex'] <= 2)]
    relaxed_rss_with_doi = post_proc_df[
        (post_proc_df['match'] == 1) & (post_proc_df['strict_regex'] <= 2)
    ]
    # Store dataframes in a dictionary for easier access
    dfs = {
        'full': (full_rss, post_proc_df),
        'strict_rss': (full_strict_rss, strict_rss_with_doi),
        'relaxed_rss': (full_relaxed_rss, relaxed_rss_with_doi),
    }
    # Initialize statistics dictionary
    match_df = post_proc_df[
        (post_proc_df['abstract_match']) & (post_proc_df['doi_good_to_go'])
    ]
    stats = {
        'total': len(full_df),
        'oa': len(match_df[match_df['pdf_available']]),
        'non_oa': len(match_df[match_df['pdf_url'] == '']),  # Non-OA with abstract match
        'oa_no_info': len(match_df[match_df['pdf_url'].apply(lambda x: 'Error' in x)]),  # OA with errors in URL
    }
    # Calculate statistics for each category (full, strict_rss, relaxed_rss)
    for key, (fdf, doi_df) in dfs.items():
        abs_df = doi_df[doi_df['abstract_found']]
        stats[f'{key}_match'] = len(fdf)
        stats[f'{key}_match_with_doi'] = len(doi_df)
        stats[f'{key}_match_missing_doi'] = len(fdf) - len(doi_df)
        stats[f'{key}_abstracts_found'] = len(abs_df)
        stats[f'{key}_missing_abstracts'] = len(doi_df) - len(abs_df)
        stats[f'{key}_strict_matches_with_abstract_found'] = len(
            abs_df[abs_df['abstract_match']]
        )
        stats[f'{key}_strict_matches_without_abstract'] = len(
            doi_df[doi_df['abstract_match']]
        ) - len(abs_df[abs_df['abstract_match']])
        stats[f'{key}_total_strict_matches'] = len(doi_df[doi_df['abstract_match']])
        stats[f'{key}_doi_good_matches_with_abstract_found'] = len(
            abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])]
        )
        stats[f'{key}_doi_good_matches_without_abstract'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        ) - len(abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])])
        stats[f'{key}_total_doi_good_matches'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        )
    # Ensure all stats are integers
    stats = {k: int(v) for k, v in stats.items()}
    # Get the time range of parsed data
    parsed_times = full_df['parsed_time'].values
    start = min(parsed_times)
    end = max(parsed_times)
    stats['period'] = end - start
    stats['start'] = time.strftime('%d-%m-%Y', time.gmtime(start))
    stats['end'] = time.strftime('%d-%m-%Y', time.gmtime(end))
    return stats
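A minimal sketch (an addition, not in the original notebook) for inspecting the dictionary returned by get_stats(); only keys created in the function above are used.

stats = get_stats()

# Print the headline numbers for the 'full' category plus the open-access split.
for key in (
    'total',
    'full_match',
    'full_match_with_doi',
    'full_abstracts_found',
    'full_total_strict_matches',
    'full_total_doi_good_matches',
    'oa',
    'non_oa',
    'oa_no_info',
):
    print(f'{key:30s} {stats[key]}')
print('covered period (days):', int(stats['period'] / 86400))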
3. Visualisation¶
In [5]:
def complex_sankey():
    # Get statistics from the data
    stats = get_stats()
    # Define labels for the Sankey diagram nodes
    labels = [
        'RSS Summary <br>Keyword Match',
        'Un-Resolvable Reference',
        'Abstract Found',
        'Abstract Missing',
        'Filtered Papers',
        'Open-Access',
        'Non Open-Access',
        'Failed Retrieval',
    ]
    # Define source and target nodes for the links
    sources = [0, 0, 0, 2, 3, 4, 4, 4]
    targets = [1, 2, 3, 4, 4, 5, 6, 7]
    # Define the values (thickness) for the links based on calculated statistics
    values = [
        stats['full_match_missing_doi'],  # Keyword Match -> Un-Resolvable Reference
        stats['full_abstracts_found'],  # Keyword Match -> Abstract Found
        stats['full_missing_abstracts'],  # Keyword Match -> Abstract Missing
        stats['full_doi_good_matches_with_abstract_found'],  # Abstract Found -> Filtered Papers
        stats['full_doi_good_matches_without_abstract'],  # Abstract Missing -> Filtered Papers
        stats['oa'],  # Filtered Papers -> Open-Access
        stats['non_oa'],  # Filtered Papers -> Non Open-Access
        stats['oa_no_info'],  # Filtered Papers -> Failed Retrieval
    ]
    # Calculate the total value for each node to display in the label
    node_values = []
    for i in range(len(labels)):
        v = 0
        node_list = targets
        if i not in targets:
            node_list = sources
        for j in range(len(node_list)):
            v += values[j] if node_list[j] == i else 0
        node_values.append(v)
    # Format labels to include node values
    labels = [f'<b>{i}<br>{v}</b>' for i, v in zip(labels, node_values)]
    # Create the Sankey diagram figure
    fig = go.Figure(
        data=[
            go.Sankey(
                valueformat='.0f',
                arrangement='snap',
                node=dict(
                    pad=8,
                    thickness=10,
                    line=dict(color='black', width=0.5),
                    label=labels,
                    align='left',
                ),
                link=dict(
                    source=sources,
                    target=targets,
                    value=values,
                    color='rgba(0,0,255,0.2)',
                ),
            )
        ]
    )
    # Set a title for the plot including total papers and date range
    fig.update_layout(
        title_text=f'<b>{stats["total"]} papers parsed from {stats["start"]} to {stats["end"]}</b>',
        font_size=12,
        width=600,
    )
    # Display the plot
    fig.show(renderer="notebook")
    return fig, stats
In [6]:
fig, stats = complex_sankey()
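Optionally (an addition, not part of the original notebook), the figure returned above can be written to a standalone HTML file for sharing outside Jupyter; the filename is arbitrary.

# Export the Sankey diagram as a self-contained HTML page (Plotly JS loaded from a CDN).
fig.write_html('papersbot_sankey.html', include_plotlyjs='cdn')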
4. Summary¶
In [ ]:
f'The multi-stage filtering pipeline for identifying new perovskite solar cell papers: \
over a {int(stats["period"] / 86400)}-day period ({stats["start"]} to {stats["end"]}), {stats["total"]} papers were parsed from journal RSS feeds. \
An initial match against the RSS summaries identified {stats["full_match"]} candidates. Subsequent steps removed papers with unresolvable DOIs (n = {stats["full_match_missing_doi"]}) \
and papers failing a secondary strict match (n = {stats["full_match"] - stats["full_total_strict_matches"] - stats["full_match_missing_doi"]}). \
Further filtering excluded theoretical, computational, and review works (n = {stats["full_total_strict_matches"] - stats["full_total_doi_good_matches"]}), yielding a final set of {stats["full_total_doi_good_matches"]} relevant papers, of which {stats["oa"]} were open-access.'
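As a small addition (not in the original notebook), the computed statistics can be persisted so the summary can be regenerated without re-downloading the run logs; default=float guards against NumPy scalar types such as the raw period value.

import json

# Write the statistics dictionary to disk; the filename is arbitrary.
with open('papersbot_stats.json', 'w') as fh:
    json.dump(stats, fh, indent=2, default=float)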