%pip install -q pydeseq2 mygene plotlyimport urllib.request, osDATA_FILE = "GSE243185_Tet1v4-featurecounts.csv.gz"if not os.path.exists(DATA_FILE): urllib.request.urlretrieve( f"https://adyprat.github.io/notebooks/{DATA_FILE}", DATA_FILE ) print(f"Downloaded {DATA_FILE}")else: print(f"{DATA_FILE} already present")Differential Expression Analysis: PWS vs WT (PyDESeq2)
This notebook demonstrates a differential gene expression analysis using PyDESeq2, a Python implementation of the DESeq2 statistical framework for RNA-seq count data.
Dataset
We use RNA-seq feature counts from GSE243185, comparing Prader-Willi Syndrome (PWS) and wild-type (WT) samples with a Tet1-inducible system. The dataset contains 9 samples across two conditions (PWS and WT) and two treatments (NT and mat3).
DESeq2 Overview
DESeq2 models raw RNA-seq counts using a negative binomial distribution and performs: 1. Size factor estimation – normalizes for differences in sequencing depth across samples 2. Dispersion estimation – models gene-wise variability (mean-variance relationship) 3. Wald test – tests for significant differences in expression between conditions
Key output columns: - baseMean: average normalized count across all samples - log2FoldChange: estimated effect size (PWS vs WT) - lfcSE: standard error of the log2 fold change - stat: Wald statistic - pvalue / padj: raw and Benjamini-Hochberg adjusted p-values
# Sources:# - PyDESeq2 docs: https://pydeseq2.readthedocs.io/ (see "A simple PyDESeq2 workflow")# - PyDESeq2 paper (Bioinformatics 2023): https://pmc.ncbi.nlm.nih.gov/articles/PMC10502239/import pandas as pdfrom pydeseq2.dds import DeseqDataSetfrom pydeseq2.ds import DeseqStats# -----------------------# Load counts (genes x samples)# -----------------------counts = pd.read_csv(DATA_FILE, index_col=0)# Ensure integers (PyDESeq2 expects raw counts)counts = counts.round().astype(int)counts.head()counts.columnsIndex(['PWS_Tet1_NT_rep1', 'WT_Tet1_NT_rep1', 'PWS_Tet1_mat3_rep1',
'PWS_Tet1_mat3_rep2', 'WT_Tet1_NT_rep3', 'PWS_Tet1_NT_rep3',
'PWS_Tet1_mat3_rep3', 'WT_Tet1_NT_rep2', 'PWS_Tet1_NT_rep2'],
dtype='object')
Subset to No-Treatment (NT) SamplesFor this analysis we focus on the untreated (NT) samples only, comparing PWS vs WT genotypes. We construct a metadata table with genotype as a categorical variable, setting WT as the reference level so that fold changes are reported as PWS relative to WT.
nt_samples = [c for c in counts.columns if "_NT_" in c]
counts_nt = counts[nt_samples]
# -----------------------
# Metadata
# -----------------------
meta_nt = pd.DataFrame(
{
"genotype": ["WT" if c.startswith("WT_") else "PWS" for c in nt_samples]
},
index=nt_samples
)
meta_nt["genotype"] = pd.Categorical(
meta_nt["genotype"],
categories=["WT", "PWS"] # WT as reference
)counts_nt.head()| PWS_Tet1_NT_rep1 | WT_Tet1_NT_rep1 | WT_Tet1_NT_rep3 | PWS_Tet1_NT_rep3 | WT_Tet1_NT_rep2 | PWS_Tet1_NT_rep2 | |
|---|---|---|---|---|---|---|
| Geneid | ||||||
| ENSG00000223972.4 | 2 | 2 | 6 | 1 | 3 | 2 |
| ENSG00000227232.4 | 139 | 194 | 164 | 166 | 193 | 117 |
| ENSG00000243485.2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ENSG00000237613.2 | 0 | 0 | 0 | 0 | 0 | 0 |
| ENSG00000268020.2 | 0 | 0 | 0 | 0 | 0 | 0 |
meta_nt["genotype"].cat.categoriesIndex(['WT', 'PWS'], dtype='object')
meta_nt.genotype.cat.categoriesIndex(['WT', 'PWS'], dtype='object')
Run DESeq2We create a DeseqDataSet with the design formula ~ genotype and call deseq2(), which sequentially estimates size factors, gene-wise dispersions, the dispersion trend, MAP dispersions, and log-fold changes. Cook’s distance is computed to flag potential outliers.
dds = DeseqDataSet(
counts=counts_nt.T, # samples x genes
metadata=meta_nt,
design = "~genotype",
n_cpus=16,
)
dds.deseq2()Fitting size factors...
... done in 0.03 seconds.
Using None as control genes, passed at DeseqDataSet initialization
Fitting dispersions...
... done in 3.03 seconds.
Fitting dispersion trend curve...
... done in 0.71 seconds.
Fitting MAP dispersions...
... done in 2.98 seconds.
Fitting LFCs...
... done in 2.49 seconds.
Calculating cook's distance...
... done in 0.02 seconds.
Replacing 0 outlier genes.
stats = DeseqStats(dds, contrast=("genotype", "PWS", "WT"))
stats.summary()
res = stats.results_df.sort_values("padj")
resRunning Wald tests...
Log2 fold change & Wald test p-value: genotype PWS vs WT
baseMean log2FoldChange lfcSE stat pvalue \
Geneid
ENSG00000223972.4 2.770771 -1.206629 1.317184 -0.916067 0.359632
ENSG00000227232.4 161.318976 -0.360331 0.192699 -1.869922 0.061495
ENSG00000243485.2 0.000000 NaN NaN NaN NaN
ENSG00000237613.2 0.000000 NaN NaN NaN NaN
ENSG00000268020.2 0.000000 NaN NaN NaN NaN
... ... ... ... ... ...
ENSG00000198695.2 13854.096611 0.123597 0.179452 0.688748 0.490982
ENSG00000210194.1 12.741503 0.352070 0.590148 0.596580 0.550788
ENSG00000198727.2 67543.264064 0.031154 0.194265 0.160369 0.872590
ENSG00000210195.2 11.502994 -0.386682 0.632432 -0.611421 0.540921
ENSG00000210196.2 268.983200 0.653766 0.190136 3.438418 0.000585
padj
Geneid
ENSG00000223972.4 NaN
ENSG00000227232.4 0.182311
ENSG00000243485.2 NaN
ENSG00000237613.2 NaN
ENSG00000268020.2 NaN
... ...
ENSG00000198695.2 0.698662
ENSG00000210194.1 NaN
ENSG00000198727.2 0.940091
ENSG00000210195.2 NaN
ENSG00000210196.2 0.005150
[57820 rows x 6 columns]
... done in 186.64 seconds.
| baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
|---|---|---|---|---|---|---|
| Geneid | ||||||
| ENSG00000224078.8 | 12399.750658 | -9.767830 | 0.163024 | -59.916655 | 0.000000e+00 | 0.000000e+00 |
| ENSG00000128731.11 | 2854.611334 | -1.116334 | 0.062686 | -17.808451 | 6.076795e-71 | 4.536328e-67 |
| ENSG00000130985.12 | 17770.718806 | 0.854281 | 0.048161 | 17.737876 | 2.138690e-70 | 1.064355e-66 |
| ENSG00000115758.8 | 3313.616152 | 0.893573 | 0.055336 | 16.148039 | 1.172294e-58 | 4.375588e-55 |
| ENSG00000127399.10 | 400.648167 | -2.006537 | 0.137246 | -14.620025 | 2.093227e-48 | 6.250375e-45 |
| ... | ... | ... | ... | ... | ... | ... |
| ENSG00000210176.1 | 5.641433 | -0.449494 | 0.907339 | -0.495398 | 6.203189e-01 | NaN |
| ENSG00000210184.1 | 0.000000 | NaN | NaN | NaN | NaN | NaN |
| ENSG00000210191.1 | 0.906952 | -2.052813 | 2.577336 | -0.796486 | 4.257495e-01 | NaN |
| ENSG00000210194.1 | 12.741503 | 0.352070 | 0.590148 | 0.596580 | 5.507881e-01 | NaN |
| ENSG00000210195.2 | 11.502994 | -0.386682 | 0.632432 | -0.611421 | 5.409211e-01 | NaN |
57820 rows × 6 columns
stats = DeseqStats(dds, contrast=("genotype", "PWS", "WT"))
stats.summary()
res = stats.results_df.sort_values("padj")
resRunning Wald tests...
Log2 fold change & Wald test p-value: genotype PWS vs WT
baseMean log2FoldChange lfcSE stat pvalue \
Geneid
ENSG00000223972.4 2.770771 -1.206629 1.317184 -0.916067 0.359632
ENSG00000227232.4 161.318976 -0.360331 0.192699 -1.869922 0.061495
ENSG00000243485.2 0.000000 NaN NaN NaN NaN
ENSG00000237613.2 0.000000 NaN NaN NaN NaN
ENSG00000268020.2 0.000000 NaN NaN NaN NaN
... ... ... ... ... ...
ENSG00000198695.2 13854.096611 0.123597 0.179452 0.688748 0.490982
ENSG00000210194.1 12.741503 0.352070 0.590148 0.596580 0.550788
ENSG00000198727.2 67543.264064 0.031154 0.194265 0.160369 0.872590
ENSG00000210195.2 11.502994 -0.386682 0.632432 -0.611421 0.540921
ENSG00000210196.2 268.983200 0.653766 0.190136 3.438418 0.000585
padj
Geneid
ENSG00000223972.4 NaN
ENSG00000227232.4 0.182311
ENSG00000243485.2 NaN
ENSG00000237613.2 NaN
ENSG00000268020.2 NaN
... ...
ENSG00000198695.2 0.698662
ENSG00000210194.1 NaN
ENSG00000198727.2 0.940091
ENSG00000210195.2 NaN
ENSG00000210196.2 0.005150
[57820 rows x 6 columns]
... done in 2.73 seconds.
| baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | |
|---|---|---|---|---|---|---|
| Geneid | ||||||
| ENSG00000224078.8 | 12399.750658 | -9.767830 | 0.163024 | -59.916655 | 0.000000e+00 | 0.000000e+00 |
| ENSG00000128731.11 | 2854.611334 | -1.116334 | 0.062686 | -17.808451 | 6.076795e-71 | 4.536328e-67 |
| ENSG00000130985.12 | 17770.718806 | 0.854281 | 0.048161 | 17.737876 | 2.138690e-70 | 1.064355e-66 |
| ENSG00000115758.8 | 3313.616152 | 0.893573 | 0.055336 | 16.148039 | 1.172294e-58 | 4.375588e-55 |
| ENSG00000127399.10 | 400.648167 | -2.006537 | 0.137246 | -14.620025 | 2.093227e-48 | 6.250375e-45 |
| ... | ... | ... | ... | ... | ... | ... |
| ENSG00000210176.1 | 5.641433 | -0.449494 | 0.907339 | -0.495398 | 6.203189e-01 | NaN |
| ENSG00000210184.1 | 0.000000 | NaN | NaN | NaN | NaN | NaN |
| ENSG00000210191.1 | 0.906952 | -2.052813 | 2.577336 | -0.796486 | 4.257495e-01 | NaN |
| ENSG00000210194.1 | 12.741503 | 0.352070 | 0.590148 | 0.596580 | 5.507881e-01 | NaN |
| ENSG00000210195.2 | 11.502994 | -0.386682 | 0.632432 | -0.611421 | 5.409211e-01 | NaN |
57820 rows × 6 columns
Gene Symbol AnnotationEnsembl gene IDs are mapped to HGNC symbols using mygene.info. We strip version suffixes (e.g., ENSG00000224078.8 becomes ENSG00000224078) before querying.
import pandas as pd
import mygene
mg = mygene.MyGeneInfo()
# Example gene list
ensembl_ids = res.index.tolist()
# Strip version numbers
ensembl_base = [g.split(".")[0] for g in ensembl_ids]
# Query
resMap = mg.querymany(
ensembl_base,
scopes="ensembl.gene",
fields=["symbol", "name"],
species="human"
)
df = pd.DataFrame(resMap)[["query", "symbol", "name"]]
dfInput sequence provided is already in string format. No operation performed
querySymDict = df.set_index('query')['symbol'].to_dict()res.loc[:,'symbol'] = res.index.str.split(".").str[0].map(querySymDict).values
keepRes = res[~res.symbol.isna()]
keepRes.to_csv('de_genes_pws_gersbach.csv')
keepRes = keepRes[~((keepRes['symbol'].str.contains('MT-', na=False))|(keepRes['symbol'].str.contains('MIR', na=False)))]
Volcano PlotThe volcano plot shows log2 fold change (x-axis) vs. statistical significance as \(-\log_{10}(\text{adjusted } p)\) (y-axis). Dashed lines mark the thresholds: |log2FC| > 1 and adjusted p-value < 0.05. Known PWS-region genes (e.g., SNRPN, NDN, MAGEL2, MKRN3) are labeled to verify that the expected locus is captured.
import numpy as npimport plotly.graph_objects as go# Copy to avoid modifying originaldf_plot = keepRes.copy()# Remove NA padjdf_plot = df_plot[df_plot["padj"].notna()]# Compute -log10 adjusted p-valuedf_plot["neg_log10_padj"] = -np.log10(df_plot["padj"])# Thresholdslfc_thresh = 1padj_thresh = 0.05# Significant masksig = (df_plot["padj"] < padj_thresh) & (abs(df_plot["log2FoldChange"]) > lfc_thresh)# PWS-region genes to labelpws_genes_human = [ "MKRN3", "MAGEL2", "NDN", "PWRN1", "NPAP1", "SNRPN", "SNORD64", "SNORD116-13", "SNORD108", "SNORD109A", "SNORD109B", "SNHG14", "IPW", "CAT", "CYP4F22",]is_pws = df_plot["symbol"].isin(pws_genes_human)fig = go.Figure()# Non-significant genesns = df_plot[~sig & ~is_pws]fig.add_trace(go.Scatter( x=ns["log2FoldChange"], y=ns["neg_log10_padj"], mode="markers", marker=dict(size=4, color="lightgrey", opacity=0.5), text=ns["symbol"], name="NS", hovertemplate="%{text}<br>log2FC: %{x:.2f}<br>-log10 padj: %{y:.2f}<extra></extra>",))# Significant genes (non-PWS)s = df_plot[sig & ~is_pws]fig.add_trace(go.Scatter( x=s["log2FoldChange"], y=s["neg_log10_padj"], mode="markers", marker=dict(size=5, color="steelblue", opacity=0.7), text=s["symbol"], name="Significant", hovertemplate="%{text}<br>log2FC: %{x:.2f}<br>-log10 padj: %{y:.2f}<extra></extra>",))# PWS-region genes (highlighted + labeled)pws = df_plot[is_pws]fig.add_trace(go.Scatter( x=pws["log2FoldChange"], y=pws["neg_log10_padj"], mode="markers+text", marker=dict(size=8, color="crimson", symbol="diamond"), text=pws["symbol"], textposition="top center", textfont=dict(size=10), name="PWS-region", hovertemplate="%{text}<br>log2FC: %{x:.2f}<br>-log10 padj: %{y:.2f}<extra></extra>",))# Threshold linesfig.add_hline(y=-np.log10(padj_thresh), line_dash="dash", line_color="grey", opacity=0.5)fig.add_vline(x=lfc_thresh, line_dash="dash", line_color="grey", opacity=0.5)fig.add_vline(x=-lfc_thresh, line_dash="dash", line_color="grey", opacity=0.5)fig.update_layout( title="Volcano Plot — PWS vs WT (NT samples)", xaxis_title="log2 Fold Change", yaxis_title="-log10 adjusted p-value", template="plotly_white", width=900, height=700, legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),)# Use HTML display so the plot embeds in static nbconvert exportfrom IPython.display import HTMLfig.show()HTML(fig.to_html(include_plotlyjs="cdn", full_html=False))