# ruff: noqa: E402, F601
Temporal Evolution of Bandgap and Absorber Composition in Perovskite Solar Cells
This notebook analyzes the temporal evolution of optical bandgap and absorber composition in perovskite solar cells using data from the Perovskite Database in NOMAD.
# Plot styling comes from the project-local plotly_theme module.
# NOTE(review): presumably these register a custom Plotly template and make it
# the default for every figure below — confirm in plotly_theme.
from plotly_theme import register_template, set_defaults
register_template()
set_defaults()
Setup and Data Loading¶
# Read the database export (parquet) into a DataFrame
import pandas as pd

df = pd.read_parquet("perovskite_solar_cell_database.parquet")

# Tag every row with its origin: rows whose
# data.ref.name_of_person_entering_the_data equals 'LLM Extraction' become
# 'LLM Extracted'; every other value (missing ones included) becomes 'Manual Entry'.
df['source_database'] = [
    'LLM Extracted' if entered_by == 'LLM Extraction' else 'Manual Entry'
    for entered_by in df['data.ref.name_of_person_entering_the_data']
]


def _joules_to_ev(energy):
    """Convert one band-gap value from Joules to eV; missing values pass through."""
    # 1 eV = 1.60218e-19 J
    if pd.notnull(energy):
        return energy / 1.60218e-19
    return energy


# The band gap is stored in Joules; overwrite the column with the value in eV
df["results.properties.electronic.band_gap.0.value"] = (
    df["results.properties.electronic.band_gap.0.value"].map(_joules_to_ev)
)
Bandgap Evolution Over Time¶
The reported bandgap values have evolved systematically over publication years. The link below opens a filtered view in the NOMAD dashboard. You can use the scatter plot widget to view the data from the figure directly in NOMAD. Below is a YAML snippet that you can use to reproduce a helpful dashboard to explore the influence of different cation compositions.
Dashboard YAML
- type: scatter_plot
  autorange: true
  size: 10000
  markers:
    color:
      search_quantity: data.perovskite.composition_a_ions#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  y:
    search_quantity: results.properties.electronic.band_gap[0].value
    title: Bandgap
  x:
    search_quantity: data.ref.publication_date#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  layout:
    xxl:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: Infinity
    xl:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
    lg:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
    md:
      minH: 3
      minW: 3
      h: 6
      w: 12
      y: 0
      x: 0
    sm:
      minH: 3
      minW: 3
      h: 6
      w: 9
      y: 0
      x: 0
- type: terms
  show_input: false
  scale: linear
  search_quantity: data.perovskite.composition_a_ions#perovskite_solar_cell_database.schema.PerovskiteSolarCell
  layout:
    xxl:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: Infinity
    xl:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: 9
    lg:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 0
      x: 9
    md:
      minH: 3
      minW: 3
      h: 6
      w: 5
      y: 0
      x: 12
    sm:
      minH: 3
      minW: 3
      h: 9
      w: 6
      y: 6
      x: 0
# Investigate the evolution of bandgaps over the years: one marker per entry,
# coloured by whether it was entered manually or extracted by an LLM.
import plotly.express as px

fig = px.scatter(
    df,
    x="data.ref.publication_date",
    y="results.properties.electronic.band_gap.0.value",
    color="source_database",
    # Label keys must be the exact DataFrame column names; the band-gap column
    # is spelled with ".0." here (not "[0]" as in the NOMAD search quantity).
    labels={
        "data.ref.publication_date": "Publication year",
        "results.properties.electronic.band_gap.0.value": "Bandgap (eV)",
        "source_database": "",
    },
    opacity=0.5,
)
fig.update_layout(
    yaxis_title="Bandgap (eV)",
    # Tight look & feel
    height=400, width=600,
    hovermode="closest",
    showlegend=False,
    margin=dict(l=70, r=20, t=20, b=60),
)
# Thin white outline keeps overlapping semi-transparent markers distinguishable
fig.update_traces(mode='markers', marker_line_width=0.5, marker_size=7, marker_line_color='white')
fig.show(renderer="notebook")
Density Plot Analysis¶
The following contour plot with marginal histograms provides a detailed view of bandgap distribution evolution over time.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Short aliases for columns
xcol = "data.ref.publication_date"
ycol = "results.properties.electronic.band_gap.0.value"

# Restrict to band gaps within 1.2–2.2 eV (both bounds included)
df_bandgap = df[df[ycol].between(1.2, 2.2)].copy()

# Report the size of the band-gap selection (before the year filter below)
print(f"Number of entries in df_bandgap: {len(df_bandgap)}")

# Keep publications from 2014 onwards; dates that fail to parse are coerced to
# NaT and therefore drop out of the comparison.
pub_date = pd.to_datetime(df_bandgap[xcol], errors="coerce")
df_bandgap = df_bandgap.loc[pub_date.dt.year >= 2014]

# Note: the x column itself is passed to Plotly unconverted; only the filter
# above works on parsed dates.

# 2x2 grid: large density panel bottom-left, marginal histograms above it and
# to its right.
histogram_cell = {"type": "histogram"}
density_cell = {"type": "xy"}
fig = make_subplots(
    rows=2,
    cols=2,
    column_widths=[0.85, 0.15],
    row_heights=[0.15, 0.85],
    specs=[
        [dict(density_cell), dict(histogram_cell)],
        [dict(density_cell), dict(histogram_cell)],
    ],
    shared_xaxes=True,
    shared_yaxes=True,
    horizontal_spacing=0.02,
    vertical_spacing=0.02,
)

dates = df_bandgap[xcol]
bandgaps = df_bandgap[ycol]
marginal_marker = dict(color="rgba(0,0,0,0.8)")

# Main filled 2D contour
density = go.Histogram2dContour(
    x=dates,
    y=bandgaps,
    contours_coloring="fill",
    colorscale="Blues_r",
    reversescale=True,
    showscale=True,
    ncontours=15,
    zauto=False,  # fixed colour range instead of auto-scaling
    zmin=1,       # lower clamp
    zmax=200,
)
fig.add_trace(density, row=2, col=1)

# Faint individual points on top of the contour
points = go.Scattergl(
    x=dates,
    y=bandgaps,
    mode="markers",
    marker=dict(color="rgba(0,0,0,0.05)", size=4),
    hovertemplate="Year: %{x}<br>Bandgap: %{y:.3f} eV<extra></extra>",
)
fig.add_trace(points, row=2, col=1)

# Top marginal: distribution over publication date
top_histogram = go.Histogram(x=dates, nbinsx=40, marker=marginal_marker, showlegend=False)
fig.add_trace(top_histogram, row=1, col=1)

# Right marginal: distribution over bandgap
right_histogram = go.Histogram(y=bandgaps, nbinsy=40, marker=marginal_marker, showlegend=False)
fig.add_trace(right_histogram, row=2, col=2)

# Axes of the main panel
fig.update_xaxes(
    title_text="Publication year",
    showgrid=False,
    zeroline=False,
    row=2, col=1,
)
fig.update_yaxes(
    title_text="Bandgap / eV",
    showgrid=False,
    zeroline=False,
    range=[1.42, 1.72],
    row=2, col=1,
)

# The marginal plots share axes with the main panel, so their tick labels are redundant
fig.update_xaxes(showticklabels=False, row=1, col=1)
fig.update_yaxes(showticklabels=False, row=2, col=2)

# Tight look & feel
fig.update_layout(
    height=600,
    width=750,
    bargap=0.0,
    hovermode="closest",
    showlegend=False,
    margin=dict(l=70, r=20, t=20, b=60),
)

fig.show(renderer="notebook")
Bandgap: %{y:.3f} eV
Number of entries in df_bandgap: 36788
Compositional Analysis¶
The density plot reveals a clear trend toward bandgap values of approximately 1.55 eV since 2021. To understand the underlying compositional changes, we analyze the most common pure iodide perovskite compositions: MAPbI, FAPbI, CsFAPbI, CsFAMAPbI, and FAMAPbI.
# Plot composition vs bandgap using violin plots for the most common compositions.
# Rows missing either the composition or the band gap are dropped, and band gaps
# above 3.0 eV are excluded.
df_composition_bandgap = df.dropna(subset=['data.perovskite.composition_short_form', 'results.properties.electronic.band_gap.0.value'])
df_composition_bandgap = df_composition_bandgap[df_composition_bandgap['results.properties.electronic.band_gap.0.value'] <= 3.0]
import plotly.express as px
# Plot only data for MAPbI, CsFAMAPbI, CsFAPbI, FAMAPbI, FAPbI (this list also
# fixes the left-to-right order of the violins).
allowed_compositions = ['MAPbI', 'CsFAMAPbI', 'CsFAPbI', 'FAMAPbI', 'FAPbI',]
fig = px.violin(
    df_composition_bandgap[df_composition_bandgap['data.perovskite.composition_short_form'].isin(allowed_compositions)],
    x="data.perovskite.composition_short_form",
    y="results.properties.electronic.band_gap.0.value",
    color="data.perovskite.composition_short_form",
    box=True, points="all",
    # Each column appears once. The composition label is deliberately blank:
    # the previous dict listed this key twice and only the later "" took effect.
    labels={
        "results.properties.electronic.band_gap.0.value": "Bandgap / eV",
        "data.perovskite.composition_short_form": "",
    },
    category_orders={"data.perovskite.composition_short_form": allowed_compositions},
)
fig.update_traces(marker=dict(opacity=0.3, size=4))
fig.update_layout(
    yaxis_title="Bandgap / eV",
    height=400, width=700,
    # Zoom in on the region where these compositions cluster
    yaxis=dict(range=[1.4, 1.7]),
    showlegend=False,
)
fig.show(renderer="notebook")
# Print the number of entries in df_composition_bandgap with allowed compositions by source and composition
# The allowed-composition subset is built once and reused for every count.
df_allowed = df_composition_bandgap[
    df_composition_bandgap['data.perovskite.composition_short_form'].isin(allowed_compositions)
]
# Iterate over the sources of the full frame so that a source without any
# allowed composition is still reported (with zero counts).
for source in df_composition_bandgap['source_database'].unique():
    df_source = df_allowed[df_allowed['source_database'] == source]
    composition_counts = df_source['data.perovskite.composition_short_form'].value_counts()
    print(f"Source: {source}")
    for composition in allowed_compositions:
        count = int(composition_counts.get(composition, 0))
        print(f" Composition: {composition}, Count: {count}")
    total_count = len(df_source)
    print(f" Total entries with allowed compositions: {total_count}\n")
# Print the total number of entries with allowed compositions regardless of source_database
print(f"Total entries with allowed compositions regardless of source_database: {len(df_allowed)}")
Source: Manual Entry Composition: MAPbI, Count: 26342 Composition: CsFAMAPbI, Count: 36 Composition: CsFAPbI, Count: 111 Composition: FAMAPbI, Count: 260 Composition: FAPbI, Count: 565 Total entries with allowed compositions: 27314 Source: LLM Extracted Composition: MAPbI, Count: 933 Composition: CsFAMAPbI, Count: 102 Composition: CsFAPbI, Count: 253 Composition: FAMAPbI, Count: 184 Composition: FAPbI, Count: 587 Total entries with allowed compositions: 2059 Total entries with allowed compositions regardless of source_database: 29373
Temporal Compositional Shifts¶
The median bandgaps for CsFAPbI, CsFAMAPbI, and FAMAPbI are centered around 1.55 eV. To quantify the temporal evolution of compositional preferences, we compare the distribution of perovskite compositions published up to 2021 with those published from 2022 onwards, when comprehensive database integration expanded significantly.
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# -----------------------------
# Data preparation
# -----------------------------
col = "data.perovskite.composition_short_form"
allowed = ["MAPbI", "CsFAMAPbI", "CsFAPbI", "FAMAPbI", "FAPbI"]

# Work on a copy and derive the publication year; dates that cannot be parsed
# are coerced to NaT, which yields a missing year and removes the row below.
df = df_composition_bandgap.copy()
parsed_dates = pd.to_datetime(df["data.ref.publication_date"], errors="coerce")
df["publication_year"] = parsed_dates.dt.year
df = df.dropna(subset=["publication_year"])

# Two periods (up to 2021 / from 2022), each restricted to the allowed compositions
is_allowed = df[col].isin(allowed)
df_early = df[(df["publication_year"] <= 2021) & is_allowed]
df_late = df[(df["publication_year"] >= 2022) & is_allowed]
# -----------------------------
# Helpers
# -----------------------------
def counts_with_other(series, min_pct_for_other=None):
    """Count category occurrences, optionally folding rare ones into "Other".

    Args:
        series: pandas Series of category labels; missing values are ignored
            by ``value_counts``.
        min_pct_for_other: If given, every category whose share of the total
            (in percent) is below this value is summed into a single "Other"
            entry. ``None`` leaves all categories untouched.

    Returns:
        A ``(counts, total)`` tuple: ``counts`` is an integer Series ordered by
        descending frequency, with "Other" appended last when it is newly
        created; ``total`` is the number of counted entries. An empty input
        yields an empty Series and ``0``.
    """
    counts = series.value_counts()
    total = int(counts.sum())
    if total == 0:
        return pd.Series(dtype=int), 0
    if min_pct_for_other is not None:
        is_rare = (counts / total * 100) < min_pct_for_other
        if is_rare.any():
            # Build the result through a dict so that a surviving category that
            # is already called "Other" absorbs the rare ones instead of
            # producing a duplicate index label.
            grouped = counts[~is_rare].to_dict()
            grouped["Other"] = grouped.get("Other", 0) + int(counts[is_rare].sum())
            counts = pd.Series(grouped)
    return counts.astype(int), total
def prep_text(counts, total, inside_pct=10.0, force_inside_label=None,
              pull_below_pct=7.0, pull_amount=0.03):
    """Build per-slice labels, text positions and pull offsets for a pie chart.

    Args:
        counts: pandas Series mapping category label -> count.
        total: Denominator used for the percentages. When it is 0, every
            percentage is reported as 0.0 instead of raising.
        inside_pct: Slices with at least this percentage get their text placed
            inside the slice, on two lines.
        force_inside_label: Optional label whose text is always placed inside,
            regardless of its percentage.
        pull_below_pct: Slices with a percentage below this value are pulled
            out of the pie.
        pull_amount: Pull offset applied to those small slices.

    Returns:
        A tuple ``(labels, values, text, textpos, pull)`` of equally long lists.
    """
    labels = counts.index.tolist()
    values = counts.values.astype(int).tolist()
    # Guard the division so a zero total cannot raise ZeroDivisionError.
    pcts = [value * 100.0 / total if total else 0.0 for value in values]

    text, textpos, pull = [], [], []
    for label, pct in zip(labels, pcts):
        forced = force_inside_label is not None and label == force_inside_label
        if forced or pct >= inside_pct:
            # Inside text gets a line break between label and percentage
            text.append(f"{label}<br> ({pct:.1f}%)")
            textpos.append("inside")
        else:
            text.append(f"{label} ({pct:.1f}%)")
            textpos.append("outside")
        pull.append(pull_amount if pct < pull_below_pct else 0.0)
    return labels, values, text, textpos, pull
# Left panel (early period): categories below 1 % are folded into "Other"
counts1, total1 = counts_with_other(df_early[col], min_pct_for_other=1.0)
l1, v1, t1, tp1, pull1 = prep_text(counts1, total1, inside_pct=10.0)

# Right panel (late period): every category is kept as is
counts2, total2 = counts_with_other(df_late[col], min_pct_for_other=None)
l2, v2, t2, tp2, pull2 = prep_text(counts2, total2, inside_pct=10.0)

# MAPbI share of the early period.
# NOTE(review): these two values are computed but not used by the figure below.
mapbi_left = int(counts1.get("MAPbI", 0))
mapbi_left_pct = mapbi_left / total1 * 100.0 if total1 else 0.0

# -----------------------------
# Figure
# -----------------------------
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "domain"}, {"type": "domain"}]],
    horizontal_spacing=0.20,
)

# Settings shared by both donuts
donut_style = dict(
    hole=0.33,
    textinfo="text",
    sort=False,
    direction="clockwise",
    automargin=True,
    hovertemplate="%{label}<br>%{value} (%{percent})<extra></extra>",
)

# (subplot column, prepared slice data, requested horizontal domain) — left first
donut_panels = [
    (1, (l1, v1, t1, tp1, pull1), [0.00, 0.44]),
    (2, (l2, v2, t2, tp2, pull2), [0.56, 1.00]),
]
for panel_col, (slice_labels, slice_values, slice_text, slice_pos, slice_pull), x_domain in donut_panels:
    fig.add_trace(
        go.Pie(
            labels=slice_labels,
            values=slice_values,
            text=slice_text,
            textposition=slice_pos,
            pull=slice_pull,
            domain=dict(x=x_domain),
            **donut_style,
        ),
        1, panel_col
    )

# -----------------------------
# Titles (panel labels)
# -----------------------------
for panel_title, title_x in (("<b>Until 2021</b>", 0.22), ("<b>From 2022</b>", 1.02)):
    fig.add_annotation(
        text=panel_title,
        x=title_x,
        y=1.08,
        xref="paper",
        yref="paper",
        showarrow=False,
        font=dict(size=26),
    )

# -----------------------------
# Styling for publication
# -----------------------------
fig.update_layout(
    height=400,
    width=700,
    showlegend=False,
    margin=dict(l=30, r=30, t=70, b=30),
    font=dict(size=18),
    # Only a minimum text size is enforced; no uniformtext mode is set.
    uniformtext=dict(minsize=14),
)

fig.update_traces(
    marker_line_width=1.2,
    marker_line_color="white",
    outsidetextfont=dict(size=18),
    insidetextfont=dict(size=28),
    insidetextorientation="radial",
)

fig.show(renderer="notebook")

# Report how many entries fall into each period
print(f"Number of entries in df_early: {len(df_early)}")
print(f"Number of entries in df_late: {len(df_late)}")
({p:.1f}%)") textpos.append("inside") elif p >= inside_pct: text.append(f"{lab}
({p:.1f}%)") textpos.append("inside") else: text.append(f"{lab} ({p:.1f}%)") textpos.append("outside") pull.append(0.03 if p < 7 else 0.0) return labels, values, text, textpos, pull # Left side: group tiny categories into Other and DO NOT rely on inside text counts1, total1 = counts_with_other(df_early[col], min_pct_for_other=1.0) l1, v1, t1, tp1, pull1 = prep_text(counts1, total1, inside_pct=10.0) # Right side: normal behavior is fine counts2, total2 = counts_with_other(df_late[col], min_pct_for_other=None) l2, v2, t2, tp2, pull2 = prep_text(counts2, total2, inside_pct=10.0) # Compute the MAPbI share on the left for a manual annotation (guaranteed visible) mapbi_left = int(counts1.get("MAPbI", 0)) mapbi_left_pct = (mapbi_left / total1 * 100.0) if total1 else 0.0 # ----------------------------- # Figure # ----------------------------- fig = make_subplots( rows=1, cols=2, specs=[[{"type": "domain"}, {"type": "domain"}]], horizontal_spacing=0.20 ) # Left donut fig.add_trace( go.Pie( labels=l1, values=v1, hole=0.33, text=t1, textinfo="text", textposition=tp1, pull=pull1, sort=False, direction="clockwise", domain=dict(x=[0.00, 0.44]), automargin=True, hovertemplate="%{label}
%{value} (%{percent})
%{value} (%{percent})
Number of entries in df_early: 27593 Number of entries in df_late: 1777