In [ ]:
# ruff: noqa: E402
Perovskite Papersbot Analysis
This notebook retrieves data from the Perovskite Papersbot and plots the filtering steps.
In [1]:
import time
import pandas as pd
import plotly.graph_objects as go
from huggingface_hub import HfApi, snapshot_download
from plotly_theme import register_template, set_defaults # type: ignore
# Register and set default Plotly theme for consistent styling
register_template()
set_defaults()
Overview¶
- Data Retrieval: Downloads the Perovskite Papersbot run log files stored in the Hugging Face repo pilar12/perovskite-papersbot.
- Data Processing: Loads and processes the downloaded CSV files to extract statistics on paper matching, abstract availability, and open-access status.
- Visualisation: Generates a Sankey diagram of the multi-stage filtering pipeline for identifying new perovskite solar cell papers, showing how papers flow through the successive filtering stages.
- Summary: Provides a textual summary of the filtering process, including the total number of papers parsed, the initial matches, the papers filtered out, and the final count of relevant and open-access papers.
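As a quick orientation for the steps listed above, the sketch below (an addition to this notebook, not part of the original pipeline) shows how the helper functions defined in the following cells fit together end to end:

# Preview of the notebook's flow; the functions are defined in the cells below.
download_files()               # 1. fetch the Papersbot run logs from Hugging Face
fig, stats = complex_sankey()  # 2.-3. compute statistics (via get_stats) and draw the Sankey diagram
print(stats['total'], 'papers parsed between', stats['start'], 'and', stats['end'])  # 4. summary numbers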
1. Data Retrieval¶
In [2]:
# Initialize Hugging Face API token and repository ID
repo_id = 'pilar12/perovskite-papersbot'
api = HfApi()
local_dir = 'paperbot_runs'
# Function to download files from the Hugging Face repository
def download_files():
    snapshot_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,  # Local directory to save the downloaded files
        repo_type='dataset',
        revision='comb_regex',
        force_download=True,
    )
    return snapshot_path
In [3]:
download_files()
Fetching 14 files: 100%|██████████| 14/14 [00:03<00:00, 3.89it/s]
Out[3]:
'/Users/pepemarquez/git/Pepe-Marquez/nomad-distro-dev/packages/nomad-perovskite-solar-cells-database/src/perovskite_solar_cell_database/example_uploads/perla_notebooks/paperbot_runs'
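As an optional sanity check (an addition, not part of the original notebook), the snippet below lists the CSV run logs that snapshot_download placed in the local directory; the processing step expects at least post_proc.csv and entry_stats.csv to be present.

from pathlib import Path

# List the downloaded CSV run logs and their sizes.
for csv_path in sorted(Path(local_dir).glob('*.csv')):
    print(f'{csv_path.name:25s} {csv_path.stat().st_size / 1024:.1f} kB')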
2. Data Processing¶
In [4]:
def get_stats():
    # Load post-processed data and full entry statistics
    post_proc_df = pd.read_csv(f'{local_dir}/post_proc.csv').replace({float('nan'): ''})
    post_proc_df['pdf_available'] = post_proc_df['pdf_available'].apply(
        lambda x: False if x == '' else x
    )
    full_df = pd.read_csv(f'{local_dir}/entry_stats.csv').replace({float('nan'): ''})
    # Filter data based on different matching criteria
    full_rss = full_df[full_df['match'] == 1]
    full_strict_rss = full_df[full_df['strict_regex'] > 2]
    strict_rss_with_doi = post_proc_df[post_proc_df['strict_regex'] > 2]
    full_relaxed_rss = full_df[(full_df['match'] == 1) & (full_df['strict_regex'] <= 2)]
    relaxed_rss_with_doi = post_proc_df[
        (post_proc_df['match'] == 1) & (post_proc_df['strict_regex'] <= 2)
    ]
    # Store dataframes in a dictionary for easier access
    dfs = {
        'full': (full_rss, post_proc_df),
        'strict_rss': (full_strict_rss, strict_rss_with_doi),
        'relaxed_rss': (full_relaxed_rss, relaxed_rss_with_doi),
    }
    # Initialize statistics dictionary
    match_df = post_proc_df[
        (post_proc_df['abstract_match']) & (post_proc_df['doi_good_to_go'])
    ]
    stats = {
        'total': len(full_df),
        'oa': len(match_df[match_df['pdf_available']]),
        'non_oa': len(match_df[match_df['pdf_url'] == '']),  # Non-OA with abstract match
        'oa_no_info': len(match_df[match_df['pdf_url'].apply(lambda x: 'Error' in x)]),  # OA with errors in URL
    }
    # Calculate statistics for each category (full, strict_rss, relaxed_rss)
    for key, (fdf, doi_df) in dfs.items():
        abs_df = doi_df[doi_df['abstract_found']]
        stats[f'{key}_match'] = len(fdf)
        stats[f'{key}_match_with_doi'] = len(doi_df)
        stats[f'{key}_match_missing_doi'] = len(fdf) - len(doi_df)
        stats[f'{key}_abstracts_found'] = len(abs_df)
        stats[f'{key}_missing_abstracts'] = len(doi_df) - len(abs_df)
        stats[f'{key}_strict_matches_with_abstract_found'] = len(
            abs_df[abs_df['abstract_match']]
        )
        stats[f'{key}_strict_matches_without_abstract'] = len(
            doi_df[doi_df['abstract_match']]
        ) - len(abs_df[abs_df['abstract_match']])
        stats[f'{key}_total_strict_matches'] = len(doi_df[doi_df['abstract_match']])
        stats[f'{key}_doi_good_matches_with_abstract_found'] = len(
            abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])]
        )
        stats[f'{key}_doi_good_matches_without_abstract'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        ) - len(abs_df[(abs_df['abstract_match']) & (abs_df['doi_good_to_go'])])
        stats[f'{key}_total_doi_good_matches'] = len(
            doi_df[(doi_df['abstract_match']) & (doi_df['doi_good_to_go'])]
        )
    # Ensure all stats are integers
    stats = {k: int(v) for k, v in stats.items()}
    # Get the time range of parsed data
    parsed_times = full_df['parsed_time'].values
    start = min(parsed_times)
    end = max(parsed_times)
    stats['period'] = end - start
    stats['start'] = time.strftime('%d-%m-%Y', time.gmtime(start))
    stats['end'] = time.strftime('%d-%m-%Y', time.gmtime(end))
    return stats
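A minimal sketch (an addition, not in the original notebook) for inspecting the dictionary returned by get_stats(); only keys created in the function above are used.

stats = get_stats()

# Print the headline numbers for the 'full' category plus the open-access split.
for key in (
    'total',
    'full_match',
    'full_match_with_doi',
    'full_abstracts_found',
    'full_total_strict_matches',
    'full_total_doi_good_matches',
    'oa',
    'non_oa',
    'oa_no_info',
):
    print(f'{key:30s} {stats[key]}')
print('covered period (days):', int(stats['period'] / 86400))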
3. Visualisation¶
In [5]:
def complex_sankey():
    # Get statistics from the data
    stats = get_stats()
    # Define labels for the Sankey diagram nodes
    labels = [
        'RSS Summary <br>Keyword Match',
        'Un-Resolvable Reference',
        'Abstract Found',
        'Abstract Missing',
        'Filtered Papers',
        'Open-Access',
        'Non Open-Access',
        'Failed Retrieval',
    ]
    # Define source and target nodes for the links
    sources = [0, 0, 0, 2, 3, 4, 4, 4]
    targets = [1, 2, 3, 4, 4, 5, 6, 7]
    # Define the values (thickness) for the links based on calculated statistics
    values = [
        stats['full_match_missing_doi'],  # Keyword Match -> Un-Resolvable Reference
        stats['full_abstracts_found'],  # Keyword Match -> Abstract Found
        stats['full_missing_abstracts'],  # Keyword Match -> Abstract Missing
        stats['full_doi_good_matches_with_abstract_found'],  # Abstract Found -> Filtered Papers
        stats['full_doi_good_matches_without_abstract'],  # Abstract Missing -> Filtered Papers
        stats['oa'],  # Filtered Papers -> Open-Access
        stats['non_oa'],  # Filtered Papers -> Non Open-Access
        stats['oa_no_info'],  # Filtered Papers -> Failed Retrieval
    ]
    # Calculate the total value for each node to display in the label
    node_values = []
    for i in range(len(labels)):
        v = 0
        node_list = targets
        if i not in targets:
            node_list = sources
        for j in range(len(node_list)):
            v += values[j] if node_list[j] == i else 0
        node_values.append(v)
    # Format labels to include node values
    labels = [f'<b>{i}<br>{v}</b>' for i, v in zip(labels, node_values)]
    # Create the Sankey diagram figure
    fig = go.Figure(
        data=[
            go.Sankey(
                valueformat='.0f',
                arrangement='snap',
                node=dict(
                    pad=8,
                    thickness=10,
                    line=dict(color='black', width=0.5),
                    label=labels,
                    align='left',
                ),
                link=dict(
                    source=sources,
                    target=targets,
                    value=values,
                    color='rgba(0,0,255,0.2)',
                ),
            )
        ]
    )
    # Set a title for the plot including total papers and date range
    fig.update_layout(
        title_text=f'<b>{stats["total"]} papers parsed from {stats["start"]} to {stats["end"]}</b>',
        font_size=12,
        width=600,
    )
    # Display the plot
    fig.show(renderer="notebook")
    return fig, stats
In [6]:
fig, stats = complex_sankey()
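Optionally (an addition, not part of the original notebook), the figure returned above can be written to a standalone HTML file for sharing outside Jupyter; the filename is arbitrary.

# Export the Sankey diagram as a self-contained HTML page (Plotly JS loaded from a CDN).
fig.write_html('papersbot_sankey.html', include_plotlyjs='cdn')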
4. Summary¶
In [ ]:
f'The multi-stage filtering pipeline for identifying new perovskite solar cell papers: \
over a {int(stats["period"] / 86400)}-day period ({stats["start"]} to {stats["end"]}), {stats["total"]} papers were parsed from journal RSS feeds. \
An initial match against the RSS summaries identified {stats["full_match"]} candidates. Subsequent steps removed papers with unresolvable DOIs (n = {stats["full_match_missing_doi"]}) \
and papers failing a secondary strict match (n = {stats["full_match"] - stats["full_total_strict_matches"] - stats["full_match_missing_doi"]}). \
Further filtering excluded theoretical, computational, and review works (n = {stats["full_total_strict_matches"] - stats["full_total_doi_good_matches"]}), yielding a final set of {stats["full_total_doi_good_matches"]} relevant papers, of which {stats["oa"]} were open-access.'
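As a small addition (not in the original notebook), the computed statistics can be persisted so the summary can be regenerated without re-downloading the run logs; default=float guards against NumPy scalar types such as the raw period value.

import json

# Write the statistics dictionary to disk; the filename is arbitrary.
with open('papersbot_stats.json', 'w') as fh:
    json.dump(stats, fh, indent=2, default=float)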