In [48]:

"hide-cell"

Copied!

# ruff: noqa: E402, F601
# ruff: noqa: E402, F601

Temporal Evolution of Bandgap and Absorber Composition in Perovskite Solar Cells

This notebook analyzes the temporal evolution of optical bandgap and absorber composition in perovskite solar cells using data from the Perovskite Database in NOMAD.

In [1]:

Copied!

# Apply the project's Plotly theme once. The two helpers come from the local
# `plotly_theme` module; what they configure is defined there, not in this
# notebook.
from plotly_theme import register_template, set_defaults

register_template()
set_defaults()

Setup and Data Loading¶

In [2]:

Copied!

# Load the data from the parquet file into a DataFrame.
# The file is read exactly once; `df` is the frame every later cell starts from.
import pandas as pd

df = pd.read_parquet("perovskite_solar_cell_database.parquet")

In [3]:

Copied!

# Set the source_database column: rows whose
# 'data.ref.name_of_person_entering_the_data' equals 'LLM Extraction' are
# labelled 'LLM Extracted', every other row (including missing values) is
# labelled 'Manual Entry'.
df['source_database'] = (
    df['data.ref.name_of_person_entering_the_data']
    .eq('LLM Extraction')
    .map({True: 'LLM Extracted', False: 'Manual Entry'})
)

# Convert band gap from Joules to eV (1 eV = 1.60218e-19 J).
# NOTE: this overwrites the column in place and is therefore NOT idempotent;
# it must run exactly once per freshly loaded `df`. Missing values stay NaN
# under the division.
joules_per_ev = 1.60218e-19
df["results.properties.electronic.band_gap.0.value"] = (
    df["results.properties.electronic.band_gap.0.value"] / joules_per_ev
)

Bandgap Evolution Over Time¶

The reported bandgap values have evolved systematically over publication years. The link below opens a filtered view in the NOMAD dashboard. You can use the scatter plot widget to view the data from the figure directly in NOMAD. Below is a YAML snippet that you can use to reproduce a helpful dashboard to explore the influence of different cation compositions.

Dashboard YAML

- type: scatter_plot
  autorange: true
  size: 10000
  markers:
    color:
      search_quantity: data.perovskite.composition_a_ions#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  y:
    search_quantity: results.properties.electronic.band_gap[0].value
    title: Bandgap
  x:
    search_quantity: data.ref.publication_date#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  layout:
    xxl:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: Infinity
    xl:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
    lg:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
    md:
      minH: 3
      minW: 3
      h: 6
      w: 12
      y: 0
      x: 0
    sm:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
- type: terms
  show_input: false
  scale: linear
  search_quantity: data.perovskite.composition_a_ions#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  layout:
    xxl:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: Infinity
    xl:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: 9
    lg:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: 9
    md:
      minH: 3
      minW: 3
      h: 6
      w: 5
      y: 0
      x: 12
    sm:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 6
      x: 0

In [4]:

Copied!





# Investigate the evolution of bandgaps over the years

import plotly.express as px

# Scatter of every reported bandgap against its publication date, coloured by
# how the entry got into the database. The keys of `labels` must be the exact
# DataFrame column names, otherwise Plotly Express silently ignores them.
fig = px.scatter(
    df,
    x="data.ref.publication_date",
    y="results.properties.electronic.band_gap.0.value",
    color="source_database",
    labels={
        "data.ref.publication_date": "Publication year",
        "results.properties.electronic.band_gap.0.value": "Bandgap (eV)",
        "source_database": "",
    },
    opacity=0.5,
)
fig.update_layout(
    yaxis_title="Bandgap (eV)",
    # Tight look & feel
    height=400, width=600,
    bargap=0.0,
    hovermode="closest",
    showlegend=False,
    margin=dict(l=70, r=20, t=20, b=60),
)
fig.update_traces(mode='markers', marker_line_width=0.5, marker_size=7, marker_line_color='white')
fig.show(renderer="notebook")

Density Plot Analysis¶

The following contour plot with marginal histograms provides a detailed view of bandgap distribution evolution over time.

In [5]:

Copied!





import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Short aliases for columns
xcol = "data.ref.publication_date"
ycol = "results.properties.electronic.band_gap.0.value"

# Filter to 1.2–2.2 eV range (both bounds inclusive; missing values are dropped)
df_bandgap = df[df[ycol].between(1.2, 2.2)].copy()

# Print the number of entries in df_bandgap (counted BEFORE the year filter below)
print(f"Number of entries in df_bandgap: {len(df_bandgap)}")

# Filter from 2014 onwards; unparseable dates become NaT and are dropped by the comparison
pub_date = pd.to_datetime(df_bandgap[xcol], errors="coerce")
df_bandgap = df_bandgap[pub_date.dt.year >= 2014]

# 2x2 grid: main panel bottom-left, marginal histograms on top and on the right.
fig = make_subplots(
    rows=2, cols=2,
    column_widths=[0.85, 0.15],
    row_heights=[0.15, 0.85],
    specs=[[{"type": "xy"}, {"type": "histogram"}],
           [{"type": "xy"}, {"type": "histogram"}]],
    shared_xaxes=True,
    shared_yaxes=True,
    horizontal_spacing=0.02,
    vertical_spacing=0.02
)

# Main filled 2D contour
fig.add_trace(
    go.Histogram2dContour(
        x=df_bandgap[xcol],
        y=df_bandgap[ycol],
        contours_coloring="fill",
        colorscale="Blues_r",
        reversescale=True,
        showscale=True,
        ncontours=15,
        zauto=False,   # Turn off auto range
        zmin=1,        # Lower clamp
        zmax=200,      # Upper clamp
    ),
    row=2, col=1
)

# Overlay scatter points (very faint, so dense regions still read as density)
fig.add_trace(
    go.Scattergl(
        x=df_bandgap[xcol],
        y=df_bandgap[ycol],
        mode="markers",
        marker=dict(color="rgba(0,0,0,0.05)", size=4),
        hovertemplate="Year: %{x}<br>Bandgap: %{y:.3f} eV<extra></extra>"
    ),
    row=2, col=1
)

# Both marginal histograms share the same look
marginal_style = dict(marker=dict(color="rgba(0,0,0,0.8)"), showlegend=False)

# Top histogram (x / publication year)
fig.add_trace(
    go.Histogram(x=df_bandgap[xcol], nbinsx=40, **marginal_style),
    row=1, col=1
)

# Right histogram (y / bandgap)
fig.add_trace(
    go.Histogram(y=df_bandgap[ycol], nbinsy=40, **marginal_style),
    row=2, col=2
)

# Axes + layout
fig.update_xaxes(
    title_text="Publication year",
    row=2, col=1,
    showgrid=False, zeroline=False,
)
fig.update_yaxes(
    title_text="Bandgap / eV",
    row=2, col=1,
    showgrid=False, zeroline=False,
    range=[1.42, 1.72]  # zoom window; narrower than the 1.2–2.2 eV data filter
)

# Hide redundant axes labels on the marginal plots
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(showticklabels=False, row=2, col=2)

# Tight look & feel
fig.update_layout(
    height=600, width=750,
    bargap=0.0,
    hovermode="closest",
    showlegend=False,
    margin=dict(l=70, r=20, t=20, b=60)
)

fig.show(renderer="notebook")

Number of entries in df_bandgap: 36788

Compositional Analysis¶

The density plot reveals a clear trend toward bandgap values of approximately 1.55 eV since 2021. To understand the underlying compositional changes, we analyze the most common pure iodide perovskite compositions, using the short-form labels from the database: MAPbI, FAPbI, CsFAPbI, CsFAMAPbI, and FAMAPbI.

In [6]:

Copied!





# Plot composition vs bandgap using violin plots for the most common compositions

import plotly.express as px

composition_col = "data.perovskite.composition_short_form"
bandgap_col = "results.properties.electronic.band_gap.0.value"

# Keep rows that have both a composition and a bandgap, and drop bandgaps above 3.0 eV
df_composition_bandgap = df.dropna(subset=[composition_col, bandgap_col])
df_composition_bandgap = df_composition_bandgap[df_composition_bandgap[bandgap_col] <= 3.0]

# Plot only data for MAPbI, CsFAMAPbI, CsFAPbI, FAMAPbI, FAPbI
allowed_compositions = ['MAPbI', 'CsFAMAPbI', 'CsFAPbI', 'FAMAPbI', 'FAPbI']

# Filter once and reuse for both the figure and the counts below
df_allowed_compositions = df_composition_bandgap[
    df_composition_bandgap[composition_col].isin(allowed_compositions)
]

fig = px.violin(
    df_allowed_compositions,
    x=composition_col,
    y=bandgap_col,
    color=composition_col,
    box=True, points="all",
    # One entry per column: the composition axis is deliberately left untitled
    # because the category tick labels already name each composition.
    labels={composition_col: "", bandgap_col: "Bandgap / eV"},
    category_orders={composition_col: allowed_compositions},
)
fig.update_traces(marker=dict(opacity=0.3, size=4))
fig.update_layout(
    yaxis_title="Bandgap / eV",
    height=400, width=700,
    yaxis=dict(range=[1.4, 1.7]),
    showlegend=False,
)
fig.show(renderer="notebook")

# Print the number of entries with allowed compositions by source and composition.
# Sources are taken from the unfiltered frame so a source with no allowed
# composition is still listed (with zero counts).
for source in df_composition_bandgap['source_database'].unique():
    per_composition = (
        df_allowed_compositions.loc[
            df_allowed_compositions['source_database'] == source, composition_col
        ]
        .value_counts()
        .reindex(allowed_compositions, fill_value=0)
    )
    print(f"Source: {source}")
    for composition, count in per_composition.items():
        print(f"  Composition: {composition}, Count: {count}")
    print(f"  Total entries with allowed compositions: {per_composition.sum()}\n")

# Print the total number of entries with allowed compositions regardless of source_database
print(f"Total entries with allowed compositions regardless of source_database: {len(df_allowed_compositions)}")

Source: Manual Entry
  Composition: MAPbI, Count: 26342
  Composition: CsFAMAPbI, Count: 36
  Composition: CsFAPbI, Count: 111
  Composition: FAMAPbI, Count: 260
  Composition: FAPbI, Count: 565
  Total entries with allowed compositions: 27314

Source: LLM Extracted
  Composition: MAPbI, Count: 933
  Composition: CsFAMAPbI, Count: 102
  Composition: CsFAPbI, Count: 253
  Composition: FAMAPbI, Count: 184
  Composition: FAPbI, Count: 587
  Total entries with allowed compositions: 2059

Total entries with allowed compositions regardless of source_database: 29373

Temporal Compositional Shifts¶

The median bandgaps for CsFAPbI, CsFAMAPbI, and FAMAPbI are centered around 1.55 eV. To quantify the temporal evolution of compositional preferences, we compare the distribution of perovskite compositions published up to and including 2021 with those published from 2022 onwards, when comprehensive database integration expanded significantly.

In [7]:

Copied!





import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------
# Data preparation
# -----------------------------
# Work on a derived frame instead of rebinding `df`, so the frame loaded at the
# top of the notebook stays intact and earlier cells remain re-runnable.
df_with_year = df_composition_bandgap.assign(
    publication_year=pd.to_datetime(
        df_composition_bandgap["data.ref.publication_date"],
        errors="coerce"
    ).dt.year
).dropna(subset=["publication_year"])  # rows with unparseable dates are dropped

allowed = ["MAPbI", "CsFAMAPbI", "CsFAPbI", "FAMAPbI", "FAPbI"]
col = "data.perovskite.composition_short_form"

# Restrict to the compositions of interest, then split at the 2021/2022 boundary
df_allowed_with_year = df_with_year[df_with_year[col].isin(allowed)]
df_early = df_allowed_with_year[df_allowed_with_year["publication_year"] <= 2021]
df_late  = df_allowed_with_year[df_allowed_with_year["publication_year"] >= 2022]

# -----------------------------
# Helpers
# -----------------------------
def counts_with_other(series, min_pct_for_other=None):
    """Count category occurrences, optionally folding rare ones into "Other".

    Parameters
    ----------
    series : pandas.Series
        Categorical values to count (missing values are ignored by
        ``value_counts``).
    min_pct_for_other : float or None, default None
        If given, every category whose share of the total is strictly below
        this percentage is removed and its count is added to a single
        trailing ``"Other"`` entry. ``None`` disables the grouping.

    Returns
    -------
    (pandas.Series, int)
        Integer counts indexed by category (most frequent first, ``"Other"``
        last when present) and the total number of counted values. An empty
        input yields an empty Series and ``0``.
    """
    counts = series.value_counts()
    total = int(counts.sum())
    if total == 0:
        return pd.Series(dtype=int), 0

    if min_pct_for_other is not None:
        is_rare = (counts / total * 100) < min_pct_for_other
        if is_rare.any():
            kept = counts[~is_rare]
            other_total = int(counts[is_rare].sum())
            # A category that is itself called "Other" and large enough to be
            # kept would otherwise produce a duplicate index label; merge it
            # into the synthetic bucket instead.
            other_total += int(kept.get("Other", 0))
            kept = kept.drop(labels="Other", errors="ignore")
            counts = pd.concat([kept, pd.Series({"Other": other_total})])
    return counts.astype(int), total

def prep_text(counts, total, inside_pct=10.0, force_inside_label=None):
    """Build per-slice label text, text position and pull for a pie/donut.

    Parameters
    ----------
    counts : pandas.Series
        Slice counts indexed by slice label.
    total : int
        Denominator used for the percentages. When it is ``0`` every slice is
        reported as 0.0 % instead of raising ``ZeroDivisionError``.
    inside_pct : float, default 10.0
        Slices with a share of at least this percentage get their text placed
        inside the slice (label and percentage separated by ``<br>``).
    force_inside_label : hashable or None, default None
        Label whose text is always placed inside, regardless of its share.

    Returns
    -------
    (labels, values, text, textpos, pull) : tuple of lists
        Parallel lists, one entry per slice. ``pull`` is 0.03 for slices below
        7 % and 0.0 otherwise.
    """
    labels = counts.index.tolist()
    values = counts.values.astype(int).tolist()

    text, textpos, pull = [], [], []
    for lab, val in zip(labels, values):
        pct = val * 100.0 / total if total else 0.0
        forced = force_inside_label is not None and lab == force_inside_label
        if forced or pct >= inside_pct:
            text.append(f"{lab}<br> ({pct:.1f}%)")
            textpos.append("inside")
        else:
            text.append(f"{lab} ({pct:.1f}%)")
            textpos.append("outside")

        # Nudge small slices outwards so they stay distinguishable
        pull.append(0.03 if pct < 7 else 0.0)

    return labels, values, text, textpos, pull

# Left side: categories below 1 % of the early entries are folded into "Other"
counts1, total1 = counts_with_other(df_early[col], min_pct_for_other=1.0)
l1, v1, t1, tp1, pull1 = prep_text(counts1, total1, inside_pct=10.0)

# Right side: every category is kept as-is
counts2, total2 = counts_with_other(df_late[col], min_pct_for_other=None)
l2, v2, t2, tp2, pull2 = prep_text(counts2, total2, inside_pct=10.0)

# MAPbI share on the left. NOTE(review): these two values are computed but not
# used by the figure code in this cell — confirm whether a manual annotation
# was intended.
mapbi_left = int(counts1.get("MAPbI", 0))
mapbi_left_pct = (mapbi_left / total1 * 100.0) if total1 else 0.0

# -----------------------------
# Figure
# -----------------------------
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "domain"}, {"type": "domain"}]],
    horizontal_spacing=0.20
)

# One donut per period; both share every setting except data and x-domain
donut_panels = [
    dict(labels=l1, values=v1, text=t1, textposition=tp1, pull=pull1, x_domain=[0.00, 0.44]),
    dict(labels=l2, values=v2, text=t2, textposition=tp2, pull=pull2, x_domain=[0.56, 1.00]),
]
for panel_col, panel in enumerate(donut_panels, start=1):
    fig.add_trace(
        go.Pie(
            labels=panel["labels"], values=panel["values"], hole=0.33,
            text=panel["text"], textinfo="text",
            textposition=panel["textposition"], pull=panel["pull"],
            sort=False, direction="clockwise",
            domain=dict(x=panel["x_domain"]),
            automargin=True,
            hovertemplate="%{label}<br>%{value} (%{percent})<extra></extra>",
        ),
        1, panel_col
    )

# -----------------------------
# Titles (panel labels)
# -----------------------------
fig.add_annotation(
    text="<b>Until 2021</b>",
    x=0.22, y=1.08, xref="paper", yref="paper",
    showarrow=False, font=dict(size=26)
)
fig.add_annotation(
    text="<b>From 2022</b>",
    x=1.02, y=1.08, xref="paper", yref="paper",
    showarrow=False, font=dict(size=26)
)

# -----------------------------
# Styling for publication
# -----------------------------
fig.update_layout(
    height=400,
    width=700,
    showlegend=False,
    margin=dict(l=30, r=30, t=70, b=30),
    font=dict(size=18),
    # Only a minimum text size is set; the "hide" mode is left disabled
    uniformtext=dict(
        minsize=14,
        # mode="hide"
    ),
)

fig.update_traces(
    marker_line_width=1.2,
    marker_line_color="white",
    outsidetextfont=dict(size=18),
    insidetextfont=dict(size=28),
    insidetextorientation="radial",
)

fig.show(renderer="notebook")

# print the number of entries in each split df_early and df_late

print(f"Number of entries in df_early: {len(df_early)}")
print(f"Number of entries in df_late: {len(df_late)}")

Number of entries in df_early: 27593
Number of entries in df_late: 1777