In [ ]:
# ruff: noqa: E402, F601, E741, I001, UP015
PERLA Evaluation Analysis
This notebook evaluates the performance of the PERLA extraction pipeline by computing extraction metrics against ground truth data.
Overview¶
This notebook evaluates the performance of the PERLA (Perovskite Extraction and Research Literature Analysis) extraction pipeline by comparing extracted data against a ground truth dataset. The evaluation covers multiple Large Language Models (LLMs) and compares their extraction performance across different data fields.
Evaluation Methodology¶
The evaluation uses a confusion matrix approach:
- True Positives (TP): Fields correctly extracted and matching ground truth
- False Positives (FP): Fields extracted but not present in ground truth
- False Negatives (FN): Fields in ground truth but not extracted
Metrics calculated (a small worked example follows the list):
- Precision = TP / (TP + FP) — Measures extraction accuracy
- Recall = TP / (TP + FN) — Measures extraction completeness
- F1 Score = 2 × (Precision × Recall) / (Precision + Recall) — Harmonic mean of precision and recall
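As a quick sanity check on these definitions, the sketch below computes all three metrics from a small set of hypothetical counts (the numbers are purely illustrative, not taken from the evaluation):

tp, fp, fn = 8, 2, 4  # hypothetical counts for a single field
precision = tp / (tp + fp)  # 8 / 10 = 0.80
recall = tp / (tp + fn)  # 8 / 12 ≈ 0.67
f1 = 2 * precision * recall / (precision + recall)  # ≈ 0.73
print(f'Precision: {precision:.2f}, Recall: {recall:.2f}, F1: {f1:.2f}')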
Setup and Evaluations¶
The evaluation compares the extracted data against a ground truth dataset. Some fields are scored with the help of an LLM, so API keys for the providers in use must be available; a minimal check is sketched below.
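A minimal sketch of that setup, assuming the keys live in a local .env file under the conventional provider variable names OPENAI_API_KEY and ANTHROPIC_API_KEY (adjust to whichever providers you actually call):

import os
from dotenv import load_dotenv

load_dotenv()  # reads a .env file from the working directory, if present

# Assumed variable names for the providers used in this notebook
for key in ('OPENAI_API_KEY', 'ANTHROPIC_API_KEY'):
    print(f'{key}: {"set" if os.getenv(key) else "MISSING"}')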
Imports & Setup¶
In [1]:
# --- Imports ---
import json
import os
from importlib.resources import files
from math import pi
from pathlib import Path
import dabest
import litellm
from litellm.caching.caching import Cache
litellm.cache = Cache(type='disk')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm
# Third-party libraries
from dotenv import load_dotenv
# Internal modules
# Ensure 'perovscribe' is accessible in the repo structure
from perovscribe.pipeline import ExtractionPipeline
from plotly_theme import register_template, set_defaults, MODEL_COLORS
# --- Configuration & Theme ---
load_dotenv() # Loads .env if present
register_template()
set_defaults()
plt.rcParams.update(
{
'font.family': 'Arial',
'font.size': 10,
'axes.labelsize': 11,
'axes.titlesize': 12,
'xtick.labelsize': 10,
'ytick.labelsize': 10,
'legend.fontsize': 10,
'figure.titlesize': 12,
'axes.linewidth': 1,
'axes.edgecolor': 'black',
'axes.facecolor': 'white',
'figure.facecolor': 'white',
'grid.color': 'lightgray',
'grid.linewidth': 0.5,
'axes.grid': False,
'axes.spines.top': True,
'axes.spines.right': True,
'savefig.dpi': 300,
'savefig.bbox': 'tight',
'savefig.facecolor': 'white',
}
)
# Define paths (resolved from the installed perovscribe package data for reproducibility)
DATA_DIR = files('perovscribe').joinpath('data')
EXTRACTIONS_DIR = DATA_DIR / 'extractions'
GROUND_TRUTH_DIR = DATA_DIR / 'ground_truth' / 'test'
DEV_DIR = DATA_DIR / 'ground_truth' / 'dev'
EXPERTS_DIR = EXTRACTIONS_DIR / 'humans' / 'Consensus'
# MODEL_COLORS is imported from plotly_theme (single source of truth for colors)
Pre-compiling numba functions for DABEST...
Compiling numba functions: 100%|██████████| 11/11 [00:00<00:00, 61.77it/s]
Numba compilation complete!
22:39:14 - LiteLLM:ERROR: redis_cache.py:178 - Error connecting to Sync Redis client
22:39:15 - LiteLLM:ERROR: redis_cache.py:1081 - LiteLLM Redis Cache PING: - Got exception from REDIS : Error 61 connecting to 127.0.0.1:6379. Connect call failed ('127.0.0.1', 6379).
In [2]:
# ============================================================================
# COUNT DEVICES IN TEST AND VALIDATION SETS
# ============================================================================
def count_devices_in_directory(ground_truth_dir):
"""
Count the number of devices in a ground truth directory.
Each JSON file contains a "cells" list, where each element represents a device.
This function counts the length of the "cells" list in each file.
Args:
ground_truth_dir: Path to ground truth directory
Returns:
dict with 'files' (number of JSON files), 'total_devices' (sum of len(cells)),
and 'papers' (number of unique papers/DOIs)
"""
if not ground_truth_dir.exists():
return {'files': 0, 'total_devices': 0, 'papers': 0}
json_files = list(ground_truth_dir.glob('*.json'))
num_files = len(json_files)
# Count devices from "cells" list in each JSON file
total_devices = 0
papers_with_devices = set()
for json_file in json_files:
try:
with open(json_file, 'r') as f:
data = json.load(f)
# Count devices: length of "cells" list
if isinstance(data, dict) and 'cells' in data:
devices_in_file = len(data['cells'])
total_devices += devices_in_file
# Extract DOI from filename (format: DOI.json)
# Filename format is like "10.1002--adma.202305822.json"
doi = json_file.stem.replace('--', '/')
papers_with_devices.add(doi)
else:
# Fallback: if structure is unexpected, count as 1 device
total_devices += 1
except (json.JSONDecodeError, KeyError, TypeError) as e:
# If file can't be parsed, skip it
print(f'Warning: Could not parse {json_file.name}: {e}')
return {
'files': num_files,
'total_devices': total_devices,
'papers': len(papers_with_devices) if papers_with_devices else num_files,
}
# Count devices in test and validation sets
test_counts = count_devices_in_directory(GROUND_TRUTH_DIR)
dev_counts = count_devices_in_directory(DEV_DIR)
print('=' * 60)
print('DEVICE COUNTS IN DATASETS')
print('=' * 60)
print('\nTest Set:')
print(f' Number of files: {test_counts["files"]}')
print(f' Estimated devices: {test_counts["total_devices"]}')
print(f' Unique papers: {test_counts["papers"]}')
print('\nValidation Set (dev):')
print(f' Number of files: {dev_counts["files"]}')
print(f' Estimated devices: {dev_counts["total_devices"]}')
print(f' Unique papers: {dev_counts["papers"]}')
print('\nTotal:')
print(f' Files: {test_counts["files"] + dev_counts["files"]}')
print(f' Devices: {test_counts["total_devices"] + dev_counts["total_devices"]}')
print(f' Papers: {test_counts["papers"] + dev_counts["papers"]}')
print('=' * 60)
============================================================
DEVICE COUNTS IN DATASETS
============================================================

Test Set:
  Number of files: 20
  Estimated devices: 101
  Unique papers: 20

Validation Set (dev):
  Number of files: 10
  Estimated devices: 35
  Unique papers: 10

Total:
  Files: 30
  Devices: 136
  Papers: 30
============================================================
Model Configurations¶
In [3]:
# Define model metadata: display names and plot colors
# Colors are obtained from MODEL_COLORS (imported from plotly_theme)
MODEL_CONFIG = {
'gpt-5-2025-08-07': {
'name': 'GPT-5',
'color': MODEL_COLORS['GPT-5'],
},
'gpt-5-mini-2025-08-07': {
'name': 'GPT-5 Mini',
'color': MODEL_COLORS['GPT-5 Mini'],
},
'claude-opus-4-20250514': {
'name': 'Claude Opus 4',
'color': MODEL_COLORS['Claude Opus 4'],
},
'claude-sonnet-4-20250514': {
'name': 'Claude Sonnet 4',
'color': MODEL_COLORS['Claude Sonnet 4'],
},
'claude-opus-4-1-20250805': {
'name': 'Claude Opus 4.1',
'color': MODEL_COLORS['Claude Opus 4.1'],
},
'gpt-4.1-2025-04-14': {
'name': 'GPT-4.1',
'color': MODEL_COLORS['GPT-4.1'],
},
'gpt-4o-2024-08-06': {
'name': 'GPT-4o',
'color': MODEL_COLORS['GPT-4o'],
},
}
Evaluations¶
Evals Code¶
In [ ]:
# ============================================================================
# DATA LOADING AND MODEL EVALUATION
# ============================================================================
all_metrics = {} # model_name -> paper_doi -> {field: score}
all_precs_and_recalls = {}
# Evaluate all models
for model_dir in tqdm(
EXTRACTIONS_DIR.iterdir(), total=len([d for d in EXTRACTIONS_DIR.iterdir()])
):
if not model_dir.is_dir() or model_dir.name == 'humans':
continue
model_name = model_dir.name
print(f'Evaluating model: {model_name}')
pipeline = ExtractionPipeline(
model_name=model_name,
preprocessor='pymupdf',
postprocessor='NONE',
cache_dir='',
use_cache=True,
)
model_metrics, avg_recalls, avg_precisions = pipeline._evaluate_multiple(
model_dir, GROUND_TRUTH_DIR
)
all_precs_and_recalls[model_name] = {
'precision': avg_precisions,
'recall': avg_recalls,
}
all_metrics[model_name] = model_metrics
# Rename models to readable names
model_name_map = {
'claude-opus-4-1-20250805': 'Claude Opus 4.1',
'claude-opus-4-20250514': 'Claude Opus 4',
'claude-sonnet-4-20250514': 'Claude Sonnet 4',
'gpt-4.1-2025-04-14': 'GPT-4.1',
'gpt-4o-2024-08-06': 'GPT-4o',
'gpt-5-2025-08-07': 'GPT-5',
'gpt-5-mini-2025-08-07': 'GPT-5 Mini',
}
all_metrics = {model_name_map.get(k, k): v for k, v in all_metrics.items()}
Helper Functions¶
In [32]:
# ============================================================================
# HELPER FUNCTIONS (DATAFRAME VERSION)
# ============================================================================
def metrics_to_dataframe(metrics_dict):
"""
Convert nested metrics dictionary to a flat DataFrame.
Returns:
DataFrame with columns: model, paper, field, TP, FP, FN
"""
rows = []
for model, papers in metrics_dict.items():
for paper, fields in papers.items():
for field, values in fields.items():
if isinstance(values, dict):
rows.append(
{
'model': model,
'paper': paper,
'field': field,
'TP': values.get('TP', 0.0),
'FP': values.get('FP', 0.0),
'FN': values.get('FN', 0.0),
}
)
return pd.DataFrame(rows)
def add_field_categories(df):
"""Add aggregation category for each field."""
def categorize(field): # noqa: PLR0911
if field.endswith(':unit'):
return 'units'
field_lower = field.lower()
if 'composition' in field_lower:
return 'composition'
if 'stability' in field_lower:
return 'stability'
if 'deposition' in field_lower:
return 'deposition'
if 'layers' in field_lower:
return 'layers'
if 'light' in field_lower:
return 'light'
# Clean up individual fields
if any(
x in field
for x in ['averaged_quantities', 'number_devices', 'encapsulated']
):
return None
return field.replace('_', ' ').split(':value')[0]
df['category'] = df['field'].apply(categorize)
return df[df['category'].notna()]
def calculate_metrics(df, metric_type='recall'):
"""
Calculate precision or recall for each row.
Args:
df: DataFrame with TP, FP, FN columns
metric_type: 'recall' or 'precision'
"""
if metric_type == 'recall':
df['score'] = df.apply(
lambda row: row['TP'] / (row['TP'] + row['FN'])
if (row['TP'] + row['FN']) > 0
else np.nan,
axis=1,
)
else: # precision
df['score'] = df.apply(
lambda row: row['TP'] / (row['TP'] + row['FP'])
if (row['TP'] + row['FP']) > 0
else np.nan,
axis=1,
)
return df
Visualize¶
Overall Performance¶
In [33]:
# ============================================================================
# BAR CHART: OVERALL MODEL PERFORMANCE (DATAFRAME VERSION)
# ============================================================================
# Calculate overall metrics per model
df = metrics_to_dataframe(all_metrics)
df_doi = (
df.groupby(['model', 'paper'])
.agg({'TP': 'sum', 'FP': 'sum', 'FN': 'sum'})
.reset_index()
)
overall = df_doi.groupby('model').sum().reset_index()
overall['precision'] = overall['TP'] / (overall['TP'] + overall['FP'])
overall['recall'] = overall['TP'] / (overall['TP'] + overall['FN'])
# Plot
x = np.arange(len(overall))
width = 0.35
overall_performance_fig, ax = plt.subplots(figsize=(7.2, 4))
rects1 = ax.bar(x - width / 2, overall['precision'], width, label='Precision')
rects2 = ax.bar(x + width / 2, overall['recall'], width, label='Recall')
ax.set_ylabel('Score')
ax.set_title('Model Performances')
ax.set_xticks(x)
ax.set_xticklabels(overall['model'], rotation=45)
ax.set_yticks(np.arange(0, 1.1, 0.4))
ax.set_yticklabels([f'{y:.1f}' for y in np.arange(0, 1.1, 0.4)])
ax.set_ylim(0, 1.2)
ax.legend(loc='upper right', ncol=2, frameon=False)
# Add value labels
for rects, values in [(rects1, overall['precision']), (rects2, overall['recall'])]:
for rect, val in zip(rects, values):
height = rect.get_height()
ax.text(
rect.get_x() + rect.get_width() / 2.0,
height + 0.02,
f'{val:.2f}',
ha='center',
va='bottom',
)
plt.tight_layout()
overall_performance_fig.savefig('overall_performance_bars.pdf')
Radar Plot: Recall/Precision per field¶
In [50]:
# Convert to DataFrame and calculate precisions
df = metrics_to_dataframe(all_metrics)
df = add_field_categories(df)
df_recall = df.copy()
df_precision = df.copy()
df_precision = calculate_metrics(df_precision, metric_type='precision')
df_recall = calculate_metrics(df_recall, metric_type='recall')
# Aggregate by model and category
aggregated_precision = (
df_precision.groupby(['model', 'category'])['score'].mean().reset_index()
)
aggregated_recall = (
df_recall.groupby(['model', 'category'])['score'].mean().reset_index()
)
# Pivot for radar plot
pivot_precision = aggregated_precision.pivot(
index='model', columns='category', values='score'
).fillna(0)
pivot_recall = aggregated_recall.pivot(
index='model', columns='category', values='score'
).fillna(0)
# Prepare Fields
fields = sorted(pivot_precision.columns)
num_fields = len(fields)
angles = [n / float(num_fields) * 2 * pi for n in range(num_fields)]
angles += angles[:1] # Close the loop
# Formatter
def format_field_name(field):
field_map = {
'pce': 'PCE',
'jsc': r'$J_\mathrm{sc}$',
'ff': 'FF',
'voc': r'$V_\mathrm{oc}$',
'active area': 'Active Area',
'layers': 'Layers',
'device architecture': 'Architecture',
'composition': 'Composition',
'deposition': 'Deposition',
'stability': 'Stability',
'units': 'Units',
'light': 'Light',
}
return field_map.get(field.lower(), field)
field_labels = [format_field_name(f) for f in fields]
# ============================================================================
# 3. PLOTTING FUNCTION
# ============================================================================
def draw_radar(ax, pivot_df, title, letter_label, ylim=(0, 1)):
# A. Setup Grid & Spines
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)
ax.spines['polar'].set_visible(False)
# Very light, thin grid
ax.grid(color='#D9D9D9', linestyle='--', linewidth=0.5)
# B. Plot Data (Thinner lines for clarity)
for model_name in pivot_df.index:
if model_name not in MODEL_COLORS:
print(f'Warning: Unknown model {model_name}, skipping.')
continue # Skip unknown models
scores = pivot_df.loc[model_name, fields].tolist()
values = scores + [scores[0]]
color = MODEL_COLORS.get(model_name, '#333333')
ax.plot(
angles,
values,
linewidth=1.0, # Nature standard line width
linestyle='-',
marker='o',
markersize=4.5, # Small, crisp markers
markeredgewidth=0, # No border on markers for cleanliness
label=model_name,
color=color,
)
# Extremely subtle fill to keep grid visible
ax.fill(angles, values, color=color, alpha=0.01)
# C. Smart Labels (Compact & Aligned)
ax.set_xticks(angles[:-1])
ax.set_xticklabels([])
# Distance: Closer to center because fonts are smaller (1.15 is sufficient now)
label_distance = 1.15
for angle, label in zip(angles[:-1], field_labels):
angle_norm = angle % (2 * np.pi)
if np.isclose(angle_norm, 0): # 12 o'clock
ha, va = 'center', 'bottom'
elif 0 < angle_norm < np.pi: # Right
ha, va = 'left', 'center'
elif np.isclose(angle_norm, np.pi): # 6 o'clock
ha, va = 'center', 'top'
else: # Left
ha, va = 'right', 'center'
# Font size 7pt (standard text)
ax.text(
angle,
label_distance,
label,
horizontalalignment=ha,
verticalalignment=va,
color='black',
)
# D. Y-Axis (Radial) - Minimalist
ax.set_ylim(ylim)
ax.set_yticklabels([])
# Minimal ticks: Only show max and mid-point to reduce clutter
mid_val = (ylim[0] + ylim[1]) / 2
ticks = [mid_val, ylim[1]]
r_angle = np.deg2rad(22.5) # Draw ticks at 22.5 deg to avoid crossing vertical axis
for y in ticks:
ax.text(
r_angle,
y,
f'{y:.1f}',
color='#404040',
size=6,
ha='center',
va='center',
bbox=dict(facecolor='white', edgecolor='none', alpha=0.5, pad=0.5),
)
ax.set_title(title, y=1.2, fontweight='bold', color='black')
ax.text(
-0.3,
1.3,
letter_label,
transform=ax.transAxes,
fontsize=14,
fontweight='bold',
va='top',
ha='right',
color='black',
)
# ============================================================================
# 4. GENERATE FIGURE
# ============================================================================
# Width: 7.2 inches (183mm) exactly
# Height: 3.5 inches (approx 89mm) - compact aspect ratio
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7.2, 3.5), subplot_kw={'polar': True})
# Draw Plots
draw_radar(ax1, pivot_precision, 'Precision', 'a', ylim=(0.0, 1.0))
draw_radar(ax2, pivot_recall, 'Recall', 'b', ylim=(0.0, 1.0))
# Shared Legend
handles, labels = ax1.get_legend_handles_labels()
# Very compact legend
fig.legend(
handles,
labels,
loc='lower center',
bbox_to_anchor=(0.5, -0.1),
ncol=4, # 4 columns to be wider/flatter
frameon=False,
columnspacing=1.5,
handletextpad=0.5,
)
# Tight layout with manual adjustments for the legend space
plt.subplots_adjust(
top=0.82,
bottom=0.20, # Space for legend
left=0.08,
right=0.92,
wspace=0.85,
)
# Save as PDF (Vector format required by Nature)
plt.savefig('performance_radar_plot.pdf', dpi=300, format='pdf')
In [35]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(7.2, 2.5), sharey=True)
models = [
'Claude Sonnet 4',
'Claude Opus 4.1',
'Claude Opus 4',
'GPT-5',
'GPT-5 Mini',
'GPT-4.1',
'GPT-4o',
]
models = [m for m in models if m in pivot_precision.index]
x = np.arange(len(models))
width = 0.6
# Precision
means_p = [pivot_precision.loc[m].mean() for m in models]
stds_p = [pivot_precision.loc[m].std() for m in models]
ax1.barh(
x,
means_p,
xerr=stds_p,
height=width,
color='#4DBBD5',
capsize=2,
error_kw={'linewidth': 0.8},
)
ax1.set_xlim(0, 1.1)
ax1.set_xlabel('Precision', fontsize=9)
ax1.set_yticks(x)
ax1.set_yticklabels(models, fontsize=8)
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)
ax1.text(-0.25, 1.05, 'a', transform=ax1.transAxes, fontsize=12, fontweight='bold')
# Recall
means_r = [pivot_recall.loc[m].mean() for m in models]
stds_r = [pivot_recall.loc[m].std() for m in models]
ax2.barh(
x,
means_r,
xerr=stds_r,
height=width,
color='#E64B35',
capsize=2,
error_kw={'linewidth': 0.8},
)
ax2.set_xlim(0, 1.1)
ax2.set_xlabel('Recall', fontsize=9)
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)
ax2.text(-0.1, 1.05, 'b', transform=ax2.transAxes, fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('performance_bars.pdf', dpi=300)
Comparison with Experts¶
Evaluation Code¶
In [ ]:
pipeline = ExtractionPipeline(
model_name='Consensus',
preprocessor='pymupdf',
postprocessor='NONE',
cache_dir='',
use_cache=True,
)
authors_metrics, authors_recalls, authors_precisions = pipeline._evaluate_multiple(
EXPERTS_DIR, GROUND_TRUTH_DIR
)
In [37]:
all_metrics['Consensus'] = authors_metrics
experts_included_df = metrics_to_dataframe(all_metrics)
# 1. Get the set of papers that appear with model == "Consensus"
expert_papers = set(
experts_included_df.loc[experts_included_df['model'] == 'Consensus', 'paper']
)
# 2. Filter the DataFrame
filtered_df = experts_included_df[
(experts_included_df['model'] == 'Consensus')
| (
(experts_included_df['model'] != 'Consensus')
& (experts_included_df['paper'].isin(expert_papers))
)
]
In [38]:
# Group by paper and model, sum TP and FP
micro_precision_df = (
filtered_df.groupby(['paper', 'model'])[['TP', 'FP']].sum().reset_index()
)
# Compute micro-precision
micro_precision_df['precision'] = micro_precision_df['TP'] / (
micro_precision_df['TP'] + micro_precision_df['FP']
)
In [39]:
# Only select the papers where both LLMs and experts exist
papers_with_both = micro_precision_df['paper'].value_counts()
papers_with_both = papers_with_both[papers_with_both > 1].index
df_plot = micro_precision_df[micro_precision_df['paper'].isin(papers_with_both)]
# Pivot data so each row is a DOI and each column is a model
df_pivot = df_plot.pivot(
index='paper', columns='model', values='precision'
).reset_index()
# Melt data for dabest
df_melt = df_pivot.melt(id_vars='paper', var_name='model', value_name='precision')
# Create a dabest object using authors as the control
dabest_data = dabest.load(
data=df_melt,
x='model',
y='precision',
idx=(
'Consensus',
'GPT-4.1',
'Claude Opus 4',
'GPT-4o',
'GPT-5',
'Claude Sonnet 4',
'Claude Opus 4.1',
'GPT-5 Mini',
),
)
# Plot mean difference against authors
plt.figure()
mean_fig = dabest_data.mean_diff.plot(
raw_marker_size=4,
custom_palette=MODEL_COLORS,
)
Overlapping extractions spider plots¶
In [ ]:
human_metrics = {} # model_name -> paper_doi -> {field: score}
human_precs_and_recalls = {}
HUMANS_DIR = EXTRACTIONS_DIR / 'humans'
DEV_DIR = DATA_DIR / 'ground_truth' / 'dev'
# Evaluate all models
for model_dir in HUMANS_DIR.iterdir():
if not model_dir.is_dir():
continue
model_name = model_dir.name
print(f'Evaluating model: {model_name}')
pipeline = ExtractionPipeline(
model_name=model_name,
preprocessor='pymupdf',
postprocessor='NONE',
cache_dir='',
use_cache=True,
)
model_metrics, avg_recalls, avg_precisions = pipeline._evaluate_multiple(
model_dir, DEV_DIR
)
human_precs_and_recalls[model_name] = {
'precision': avg_precisions,
'recall': avg_recalls,
}
human_metrics[model_name] = model_metrics
sonnet_4_metrics, s_rec, s_prec = pipeline._evaluate_multiple(
EXTRACTIONS_DIR / 'claude-sonnet-4-20250514/', DEV_DIR
)
human_metrics['Claude Sonnet 4'] = sonnet_4_metrics
Which DOIs match across our extractions¶
In [41]:
from collections import defaultdict
doi_to_groups = defaultdict(list)
for group, dois in human_metrics.items():
for doi in dois:
doi_to_groups[doi].append(group)
print('DOI matches across groups:\n')
for doi, groups in doi_to_groups.items():
if len(groups) > 1:
print(f'{doi} -> {", ".join(groups)}')
DOI matches across groups:

10.1021--acs.chemmater.8b01521.json -> Consensus, Kit, Seal, Robin, Claude Sonnet 4
10.1021--acsaem.9b01928.json -> Consensus, Bee, Panda, Seal, Dove, Claude Sonnet 4
10.1002--adfm.201904856.json -> Consensus, Bee, Fox, Kit, Claude Sonnet 4
10.1039--c7nr04692h.json -> Consensus, Bear, Kit, Dove, Claude Sonnet 4
10.1002--adma.202302143.json -> Consensus, Bee, Robin, Claude Sonnet 4
10.1002--solr.201900370.json -> Consensus, Bear, Lark, Kit, Claude Sonnet 4
10.1039--c5dt02388b.json -> Consensus, Bear, Kit, Otto, Claude Sonnet 4
10.1016--j.nanoen.2016.05.023.json -> Consensus, Bear, Panda, Lark, Seal, Dove, Hawk, Claude Sonnet 4
10.1002--adfm.201500335.json -> Consensus, Bee, Panda, Dove, Claude Sonnet 4
10.1016--j.matlet.2016.07.004.json -> Consensus, Bear, Lark, Dove, Claude Sonnet 4
In [48]:
import numpy as np
import matplotlib.pyplot as plt
from math import pi
from matplotlib.lines import Line2D
# ---------------------------------------------------------------------
# 1) DATA PREP
# ---------------------------------------------------------------------
df = metrics_to_dataframe(human_metrics)
df = add_field_categories(df)
df = calculate_metrics(df, metric_type='precision')
aggregated = df.groupby(['model', 'category'])['score'].mean().reset_index()
pivot_df = aggregated.pivot(index='model', columns='category', values='score').fillna(0)
fields = sorted(pivot_df.columns)
num_fields = len(fields)
angles = [n / float(num_fields) * 2 * pi for n in range(num_fields)]
angles += angles[:1]
field_labels = [format_field_name(f) for f in fields]
# Separate human annotators from models
highlight_models = ['Consensus', 'Claude Sonnet 4']
human_names = [m for m in pivot_df.index if m not in highlight_models]
# ---------------------------------------------------------------------
# 2) PLOTTING
# ---------------------------------------------------------------------
def draw_radar_with_humans(ax, pivot_df, title='Precision', ylim=(0.3, 1.0)):
# Grid & spines
ax.set_theta_offset(pi / 2)
ax.set_theta_direction(-1)
ax.spines['polar'].set_visible(False)
ax.grid(color='#D9D9D9', linestyle='--', linewidth=0.5)
# A) Plot individual humans (light gray, visible but background)
for human in human_names:
scores = pivot_df.loc[human, fields].tolist()
values = scores + [scores[0]]
ax.plot(
angles,
values,
color='#AAAAAA',
linewidth=1.0,
alpha=0.8,
linestyle='-',
zorder=1,
)
# B) Plot highlighted models
model_styles = {
'Claude Sonnet 4': {'color': '#E67E22', 'lw': 2.0, 'ms': 3.5, 'zorder': 10},
'Consensus': {'color': '#1A5276', 'lw': 1.8, 'ms': 3.0, 'zorder': 9},
}
for model_name in highlight_models:
if model_name not in pivot_df.index:
continue
scores = pivot_df.loc[model_name, fields].tolist()
values = scores + [scores[0]]
style = model_styles.get(model_name)
ax.plot(
angles,
values,
linewidth=style['lw'],
linestyle='-',
marker='o',
markersize=style['ms'],
markeredgewidth=0,
color=style['color'],
zorder=style['zorder'],
)
# C) Axis limits
ax.set_ylim(ylim)
# D) X-axis labels (manual placement)
ax.set_xticks(angles[:-1])
ax.set_xticklabels([])
label_distance = 1.12
for angle, label in zip(angles[:-1], field_labels):
angle_norm = angle % (2 * np.pi)
if np.isclose(angle_norm, 0):
ha, va = 'center', 'bottom'
elif 0 < angle_norm < np.pi:
ha, va = 'left', 'center'
elif np.isclose(angle_norm, np.pi):
ha, va = 'center', 'top'
else:
ha, va = 'right', 'center'
ax.text(
angle,
label_distance,
label,
ha=ha,
va=va,
color='black',
fontsize=7,
)
# E) Radial tick labels
ax.set_yticklabels([])
mid_val = (ylim[0] + ylim[1]) / 2
for y in [mid_val, ylim[1]]:
ax.text(
np.deg2rad(22.5),
y,
f'{y:.1f}',
color='#404040',
size=6,
ha='center',
va='center',
bbox=dict(facecolor='white', edgecolor='none', alpha=0.8, pad=0.5),
)
# F) Title
ax.set_title(title, y=1.12, fontweight='bold', fontsize=10)
# ---------------------------------------------------------------------
# 3) CREATE FIGURE
# ---------------------------------------------------------------------
fig = plt.figure(figsize=(4, 4))
ax = plt.subplot(111, polar=True)
draw_radar_with_humans(ax, pivot_df, title='Precision', ylim=(0.3, 1.0))
# Custom legend
legend_elements = [
Line2D(
[0],
[0],
color='#E67E22',
linewidth=2,
marker='o',
markersize=3.5,
markeredgewidth=0,
label='Claude Sonnet 4',
),
Line2D(
[0],
[0],
color='#1A5276',
linewidth=1.8,
marker='o',
markersize=3,
markeredgewidth=0,
label='Consensus',
),
Line2D(
[0], [0], color='#AAAAAA', linewidth=1.0, alpha=0.6, label='Individual humans'
),
]
fig.legend(
handles=legend_elements,
loc='lower center',
bbox_to_anchor=(0.5, -0.02),
ncol=3,
frameon=False,
fontsize=7,
handlelength=1.5,
columnspacing=1.2,
)
plt.tight_layout()
plt.subplots_adjust(bottom=0.12)
plt.savefig('human_precision_radar_v3.pdf', dpi=300, bbox_inches='tight')
plt.show()