# ruff: noqa: E402
Performance Evolution Analysis of Perovskite Solar Cells
This notebook analyses how the performance of perovskite solar cells reported in the field has evolved over time, using data from the Perovskite Database in NOMAD.
from plotly_theme import register_template, set_defaults
register_template()
set_defaults()
# Load the data from the parquet file into a DataFrame
import pandas as pd
df = pd.read_parquet('perovskite_solar_cell_database.parquet')
# Set a source_database column: if name_of_person_entering_the_data is 'LLM Extraction', use 'LLM Extracted', else 'Manual Entry'
df['source_database'] = df['data.ref.name_of_person_entering_the_data'].apply(
lambda x: 'LLM Extracted' if x == 'LLM Extraction' else 'Manual Entry'
)
Efficiency evolution over time
We start by inspecting how reported power-conversion efficiencies have progressed with publication year. The link below opens the filtered view in the NOMAD dashboard, where you can use the scatter plot widget to explore the data from this figure directly inside NOMAD.
Dashboard YAML
- type: scatter_plot
autorange: true
size: 10000
y:
search_quantity: results.properties.optoelectronic.solar_cell.efficiency
x:
search_quantity: data.ref.publication_date#perovskite_solar_cell_database.schema.PerovskiteSolarCell
layout:
xxl:
minH: 3
minW: 3
h: 6
w: 9
y: 0
x: Infinity
xl:
minH: 3
minW: 3
h: 6
w: 9
y: 0
x: 0
lg:
minH: 3
minW: 3
h: 6
w: 9
y: 0
x: Infinity
md:
minH: 3
minW: 3
h: 6
w: 9
y: 0
x: Infinity
sm:
minH: 3
minW: 3
h: 6
w: 9
y: 0
x: Infinity
# let's plot efficiency vs publication year, color-coded by the source database
import plotly.express as px
fig = px.scatter(
df,
x='data.ref.publication_date',
y='results.properties.optoelectronic.solar_cell.efficiency',
color='source_database',
labels={
'data.ref.publication_date': 'Publication year',
'results.properties.optoelectronic.solar_cell.efficiency': 'Efficiency / %',
'source_database': '',
},
opacity=0.5,
)
fig.update_layout(
yaxis_title='Efficiency / %',
height=400,
width=700,
)
# x axis from year 2012 to 2026. Note the datetime format
fig.update_xaxes(range=['2012-01-01', '2026-12-31'])
fig.update_traces(
mode='markers', marker_line_width=0.5, marker_size=7, marker_line_color='white'
)
fig.show(renderer="notebook")
The newly extracted data show that the field keeps progressing steadily! 🚀 📈
Recombination loss evolution
Next, we analyze how recombination losses, quantified as the gap between the Shockley–Queisser limited and the measured open-circuit voltage, have decreased across the literature. Because this requires transforming the raw quantities, the plot cannot be reproduced directly inside the Perovskite Database dashboard, but the full workflow is captured here.
We:
- retrieve the perovskite band gap (<i>E</i><sub>g</sub>) from the archive and convert it from joules to electronvolts,
- estimate the Shockley–Queisser limited open-circuit voltage (<i>V</i><sub>OC</sub><sup>SQ</sup>) using a phenomenological relation valid for AM1.5G illumination,
- compare the measured <i>V</i><sub>OC</sub> against <i>V</i><sub>OC</sub><sup>SQ</sup> to quantify the voltage loss.
For reference (see SI in this paper), the SQ expression used here is:
<i>V</i><sub>OC</sub><sup>SQ</sup> = −0.167 + 0.932 · <i>E</i><sub>g</sub> (with <i>E</i><sub>g</sub> in eV and <i>V</i><sub>OC</sub><sup>SQ</sup> in V)
Several Python implementations are available if you want to explore the SQ limit further, for example: https://github.com/sidihamady/Shockley-Queisser
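As a quick sanity check of this relation, the short sketch below evaluates it for an assumed, typical perovskite band gap of 1.6 eV (an illustrative value, not taken from the database); it should return roughly 1.32 V.
# Sanity check of the phenomenological SQ relation for an assumed example band gap of 1.6 eV
def sq_voc(eg_ev):
    """Shockley-Queisser limited open-circuit voltage (in V) for a band gap given in eV (AM1.5G)."""
    return -0.167 + 0.932 * eg_ev

print(f'VOC,SQ for Eg = 1.60 eV: {sq_voc(1.60):.3f} V')  # ≈ 1.324 V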
# let's convert the band gap values from J to eV by dividing by the elementary charge (1.60218e-19 C)
df['results.properties.electronic.band_gap.0.value'] = (
df['results.properties.electronic.band_gap.0.value'] / 1.60218e-19
)
df['sqvoc'] = -0.167 + 0.932 * df['results.properties.electronic.band_gap.0.value']
# let's drop rows with missing values in any of the relevant columns
df.dropna(
subset=[
'results.properties.optoelectronic.solar_cell.open_circuit_voltage',
'data.ref.publication_date',
'results.properties.electronic.band_gap.0.value',
'sqvoc',
],
inplace=True,
)
# remove unphysical rows where the measured VOC exceeds the SQ limit (the difference would be negative)
df = df[
df['results.properties.optoelectronic.solar_cell.open_circuit_voltage']
<= df['sqvoc']
]
# let's get some statistics of the yearly reported values of sqvoc - voc: compute the difference per row first, then the stats
df['diff'] = (
df['sqvoc']
- df['results.properties.optoelectronic.solar_cell.open_circuit_voltage']
)
# let's now get the stats per year
df.groupby(df['data.ref.publication_date'].str[:4])['diff'].describe()
# exclude data before 2013
df = df[df['data.ref.publication_date'].str[:4].astype(int) >= 2013]
# let's plot the mean and standard deviation per year with Plotly
import plotly.express as px
fig = px.scatter(
df.groupby(df['data.ref.publication_date'].str[:4])['diff']
.describe()
.reset_index(),
x='data.ref.publication_date',
y='mean',
error_y='std',
labels={
'data.ref.publication_date': 'Year',
'mean': 'Mean of <i>V</i><sub>OC</sub><sup>SQ</sup> − <i>V</i><sub>OC</sub>',
'std': 'Standard Deviation',
},
)
fig.update_layout(
yaxis_title='Yearly mean of <i>V</i><sub>OC</sub><sup>SQ</sup> − <i>V</i><sub>OC</sub> / V'
)
fig.update_traces(mode='markers', marker_line_width=0.5, marker_size=12)
# add a fitted line to the plot and label the average decay per year. Do the fit first
import numpy as np
from scipy import stats
x = (
df.groupby(df['data.ref.publication_date'].str[:4])['diff']
.describe()
.reset_index()['data.ref.publication_date']
.astype(int)
)
y = (
df.groupby(df['data.ref.publication_date'].str[:4])['diff']
.describe()
.reset_index()['mean']
)
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
print(f'Slope: {slope}, Intercept: {intercept}, R-squared: {r_value**2}')
fig.add_traces(
px.line(x=x, y=intercept + slope * x, labels={'x': 'Year', 'y': 'Fitted line'})
.update_traces(line_color='#ff0e5a')
.data
)
fig.update_layout(
height=400,
width=700,
)
fig.show(renderer="notebook")
Slope: -0.02185503259282026, Intercept: 44.47451936484294, R-squared: 0.9517990118698644
To highlight the spread in recombination performance over time, we also render violin plots per publication year. This view complements the summary statistics and makes outliers easy to spot.
import plotly.express as px
fig = px.violin(
df,
x=df['data.ref.publication_date'].str[:4],
y='diff',
box=True,
points='all',
labels={
'x': 'Publication year',
'diff': '<i>V</i><sub>OC</sub><sup>SQ</sup> − <i>V</i><sub>OC</sub> / V',
},
)
fig.update_traces(marker=dict(opacity=0.1))
# add a fitted trend line in red
import plotly.graph_objects as go
years = df['data.ref.publication_date'].str[:4].astype(int).unique()
years.sort()
mean_diff = (
df.groupby(df['data.ref.publication_date'].str[:4])['diff']
.mean()
.reindex(years.astype(str))
.values
)
median_diff = (
    df.groupby(df['data.ref.publication_date'].str[:4])['diff']
    .median()
    .reindex(years.astype(str))
    .values
)
# Fit a linear trend line to the yearly medians
coeffs = np.polyfit(years, median_diff, 1)
trend_line = np.polyval(coeffs, years)
# calculate the slope and intercept
slope = coeffs[0]
intercept = coeffs[1]
# print r-squared value
correlation_matrix = np.corrcoef(years, median_diff)
correlation_xy = correlation_matrix[0, 1]
r_squared = correlation_xy**2
print(f'R-squared: {r_squared}')
fig.add_trace(
go.Scatter(
x=years,
y=trend_line,
mode='lines',
line=dict(color='#ff0e5a'),
name=f'{slope:.3f} V / year',
)
)
fig.update_layout(
height=400,
width=700,
)
fig.show(renderer="notebook")
R-squared: 0.9751363550492053
That completes this brief exploration. The near-linear gains in efficiency and the steady reduction in open-circuit-voltage losses emphasize how consistently the field has progressed, even as new transport layers and novel characterization techniques such as absolute photoluminescence have accelerated loss analysis.