Fitness Regression
In [1]:
Copied!
import os
import tempfile
cache_dir = tempfile.mkdtemp()
os.environ['XDG_CACHE_HOME'] = cache_dir
import utils.trees as ut
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_rel
import os
import tempfile
cache_dir = tempfile.mkdtemp()
os.environ['XDG_CACHE_HOME'] = cache_dir
import utils.trees as ut
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_rel
In [2]:
Copied!
results = "../nextflow/results/"
ranking_subdir = 'naive_reversions_first'
outbase = "output/fitness-regression/"
metadata_csv = "../gc_metadata.csv"
results = "../nextflow/results/"
ranking_subdir = 'naive_reversions_first'
outbase = "output/fitness-regression/"
metadata_csv = "../gc_metadata.csv"
In [3]:
Copied!
# Parameters
results = "."
ranking_subdir = "naive_reversions_first"
metadata_csv = "gc_metadata.csv"
outbase = "."
# Parameters
results = "."
ranking_subdir = "naive_reversions_first"
metadata_csv = "gc_metadata.csv"
outbase = "."
In [4]:
Copied!
output_dir = f"{outbase}/{ranking_subdir}"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
output_dir = f"{outbase}/{ranking_subdir}"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
In [5]:
Copied!
metadata = pd.read_csv(metadata_csv, index_col=0)
metadata.query("(strain == 'wt') & (cell_type == 'GC') & (imm_duration != 'w10')", inplace=True)
metadata.rename(columns={'imm_duration': 'time'}, inplace=True)
metadata
metadata = pd.read_csv(metadata_csv, index_col=0)
metadata.query("(strain == 'wt') & (cell_type == 'GC') & (imm_duration != 'w10')", inplace=True)
metadata.rename(columns={'imm_duration': 'time'}, inplace=True)
metadata
Out[5]:
ngs_id | time | mouse | gc | strain | node | cell_type | plate | hc_barcode | lc_barcode | row | col | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
uid | ||||||||||||
D15_M1_GC1 | PR-2-01 | d15 | 1 | 1 | wt | RP | GC | 2 | 9 | 9 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D15_M1_GC2 | PR-2-01 | d15 | 1 | 2 | wt | RI | GC | 3 | 2 | 1 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D15_M1_GC3 | PR-2-01 | d15 | 1 | 3 | wt | LI | GC | 4 | 14 | 2 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D15_M2_GC4 | PR-2-01 | d15 | 2 | 4 | wt | RP | GC | 5 | 10 | 11 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D15_M3_GC5 | PR-2-01 | d15 | 3 | 5 | wt | RP | GC | 6 | 7 | 4 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
D20_M24_GC115 | PR-1-04 | d20 | 24 | 115 | wt | RP | GC | 72 | 16 | 16 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D20_M25_GC116 | PR-1-02 | d20 | 25 | 116 | wt | RP | GC | 65 | 8 | 8 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D20_M25_GC117 | PR-1-03 | d20 | 25 | 117 | wt | RP | GC | 68 | 9 | 9 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D20_M25_GC118 | PR-1-08 | d20 | 25 | 118 | wt | RP | GC | 66 | 15 | 15 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
D20_M25_GC119 | PR-1-08 | d20 | 25 | 119 | wt | RP | GC | 67 | 16 | 16 | A.B.C.D.E.F.G.H | 1.2.3.4.5.6.7.8.9.10.11.12 |
119 rows × 12 columns
Load trees from results directory¶
In [6]:
Copied!
trees = ut.load_trees(metadata, results, ranking_subdir)
print(f"loaded {len(trees)} GC trees out of {len(metadata)} in metadata")
trees = ut.load_trees(metadata, results, ranking_subdir)
print(f"loaded {len(trees)} GC trees out of {len(metadata)} in metadata")
loaded 119 GC trees out of 119 in metadata
Affinity node feauture¶
In [7]:
Copied!
delta_bind_attr = "delta_bind"
delta_bind_attr = "delta_bind"
Data frame with a row for each node on each tree¶
In [8]:
Copied!
rows = []
for gc, tree in trees.items():
total_abundance = sum(node.abundance for node in tree.tree.traverse())
for node in tree.tree.traverse():
root_distance_nt = node.get_distance(tree.tree)
descendant_distances_nt = [node.get_distance(descendant)
for descendant in node.traverse()
if descendant.abundance]
closest_leaf_distance_nt = min(descendant_distances_nt)
farthest_leaf_distance_nt = max(descendant_distances_nt)
branch_length = sum(node.dist for node in tree.tree.traverse())
# compare nodes (e.g. bursts) with their sisters
if node.up is not None and len(node.up.children) > 1:
mean_sister_delta_bind = np.nanmean([getattr(node2, delta_bind_attr)
for node2 in node.up.children
if node2.name != node.name])
else:
mean_sister_delta_bind = np.nan
rows.append([gc,
metadata.time[gc],
node.name,
node.up.name if node.up else np.nan,
";".join(node.mutations),
";".join(set(node.mutations) - (set(node.up.mutations) if node.up is not None else set())),
";".join((set(node.up.mutations) if node.up is not None else set()) - set(node.mutations)),
node.abundance,
len(node.children),
node.abundance + len(node.children),
node.LBI,
node.LBR,
node.REI,
sum(node.LB_down.values()) / branch_length,
root_distance_nt,
closest_leaf_distance_nt,
farthest_leaf_distance_nt,
getattr(node, delta_bind_attr),
getattr(node.up, delta_bind_attr) if node.up else np.nan,
getattr(node, delta_bind_attr) - getattr(node.up, delta_bind_attr) if node.up is not None else np.nan,
node.delta_expr,
mean_sister_delta_bind,
])
df = pd.DataFrame(rows, columns=["GC",
"time",
"name",
"parent_name",
"aa_mutations",
"aa_mutations_edge_gained",
"aa_mutations_edge_lost",
"abundance",
"n_mutant_edges",
"polytomy_degree",
"LBI",
"LBR",
"REI",
"LBF",
"root_distance_nt",
"closest_leaf_distance_nt",
"farthest_leaf_distance_nt",
"delta_bind",
"parent_delta_bind",
"delta_bind_edge",
"delta_expr",
"mean_sister_delta_bind",
])
df
rows = []
for gc, tree in trees.items():
total_abundance = sum(node.abundance for node in tree.tree.traverse())
for node in tree.tree.traverse():
root_distance_nt = node.get_distance(tree.tree)
descendant_distances_nt = [node.get_distance(descendant)
for descendant in node.traverse()
if descendant.abundance]
closest_leaf_distance_nt = min(descendant_distances_nt)
farthest_leaf_distance_nt = max(descendant_distances_nt)
branch_length = sum(node.dist for node in tree.tree.traverse())
# compare nodes (e.g. bursts) with their sisters
if node.up is not None and len(node.up.children) > 1:
mean_sister_delta_bind = np.nanmean([getattr(node2, delta_bind_attr)
for node2 in node.up.children
if node2.name != node.name])
else:
mean_sister_delta_bind = np.nan
rows.append([gc,
metadata.time[gc],
node.name,
node.up.name if node.up else np.nan,
";".join(node.mutations),
";".join(set(node.mutations) - (set(node.up.mutations) if node.up is not None else set())),
";".join((set(node.up.mutations) if node.up is not None else set()) - set(node.mutations)),
node.abundance,
len(node.children),
node.abundance + len(node.children),
node.LBI,
node.LBR,
node.REI,
sum(node.LB_down.values()) / branch_length,
root_distance_nt,
closest_leaf_distance_nt,
farthest_leaf_distance_nt,
getattr(node, delta_bind_attr),
getattr(node.up, delta_bind_attr) if node.up else np.nan,
getattr(node, delta_bind_attr) - getattr(node.up, delta_bind_attr) if node.up is not None else np.nan,
node.delta_expr,
mean_sister_delta_bind,
])
df = pd.DataFrame(rows, columns=["GC",
"time",
"name",
"parent_name",
"aa_mutations",
"aa_mutations_edge_gained",
"aa_mutations_edge_lost",
"abundance",
"n_mutant_edges",
"polytomy_degree",
"LBI",
"LBR",
"REI",
"LBF",
"root_distance_nt",
"closest_leaf_distance_nt",
"farthest_leaf_distance_nt",
"delta_bind",
"parent_delta_bind",
"delta_bind_edge",
"delta_expr",
"mean_sister_delta_bind",
])
df
/tmp/ipykernel_350/1657210563.py:20: RuntimeWarning: Mean of empty slice mean_sister_delta_bind = np.nanmean([getattr(node2, delta_bind_attr)
Out[8]:
GC | time | name | parent_name | aa_mutations | aa_mutations_edge_gained | aa_mutations_edge_lost | abundance | n_mutant_edges | polytomy_degree | ... | REI | LBF | root_distance_nt | closest_leaf_distance_nt | farthest_leaf_distance_nt | delta_bind | parent_delta_bind | delta_bind_edge | delta_expr | mean_sister_delta_bind | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | D15_M1_GC1 | d15 | naive | NaN | 0 | 11 | 11 | ... | 0.082323 | 0.107784 | 0.0 | 1.0 | 9.0 | 0.00000 | NaN | NaN | 0.00000 | NaN | |||
1 | D15_M1_GC1 | d15 | seq18 | naive | Q8(L)H;K9(L)T | Q8(L)H;K9(L)T | 1 | 0 | 1 | ... | 0.014925 | 0.000000 | 3.0 | 0.0 | 0.0 | 0.02218 | 0.00000 | 0.02218 | 0.13093 | 0.000163 | |
2 | D15_M1_GC1 | d15 | seq35 | naive | S64(H)N;L116(L)P;L124(L)V | S64(H)N;L116(L)P;L124(L)V | 1 | 0 | 1 | ... | 0.014925 | 0.000000 | 3.0 | 0.0 | 0.0 | 0.09485 | 0.00000 | 0.09485 | -0.00361 | -0.007104 | |
3 | D15_M1_GC1 | d15 | seq48 | naive | S128(H)L;T37(L)S;Q44(L)H;N108(L)S | Q44(L)H;T37(L)S;N108(L)S;S128(H)L | 1 | 0 | 1 | ... | 0.014925 | 0.000000 | 4.0 | 0.0 | 0.0 | 0.75640 | 0.00000 | 0.75640 | -0.05788 | -0.073259 | |
4 | D15_M1_GC1 | d15 | seq15 | naive | K84(H)R;R106(H)K;K123(L)E | K84(H)R;R106(H)K;K123(L)E | 1 | 0 | 1 | ... | 0.014925 | 0.000000 | 3.0 | 0.0 | 0.0 | -0.37822 | 0.00000 | -0.37822 | -0.02799 | 0.040203 | |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9717 | D20_M25_GC119 | d20 | seq21 | seq13 | S57(H)N;A105(H)G;T37(L)S;A52(L)S;S92(L)T;E101(... | T37(L)S;S57(H)N;A52(L)S;E101(L)D | 1 | 0 | 1 | ... | 0.012346 | 0.000000 | 13.0 | 0.0 | 0.0 | 1.68377 | 1.03820 | 0.64557 | -0.12751 | 1.221785 | |
9718 | D20_M25_GC119 | d20 | seq24 | seq13 | A105(H)G;S92(L)T;Q105(L)H;S109(L)I | 1 | 0 | 1 | ... | 0.012346 | 0.000000 | 9.0 | 0.0 | 0.0 | 1.03820 | 1.03820 | 0.00000 | 0.02679 | 1.383177 | ||
9719 | D20_M25_GC119 | d20 | seq45 | seq13 | A105(H)G;S92(L)T;A100(L)G;Q105(L)H;S109(L)M | S109(L)M;A100(L)G | S109(L)I | 1 | 0 | 1 | ... | 0.012346 | 0.000000 | 10.0 | 0.0 | 0.0 | 1.29038 | 1.03820 | 0.25218 | 0.02708 | 1.320133 |
9720 | D20_M25_GC119 | d20 | seq70 | seq13 | A105(H)G;T88(L)A;S92(L)T;Q105(L)H;N108(L)I;S10... | N108(L)I;T88(L)A | 1 | 0 | 1 | ... | 0.012346 | 0.000000 | 11.0 | 0.0 | 0.0 | 1.52036 | 1.03820 | 0.48216 | 0.07518 | 1.262638 | |
9721 | D20_M25_GC119 | d20 | seq53 | seq18 | K84(H)R;Y88(H)H;Q90(H)R;A105(H)G;S49(L)P;D86(L... | D86(L)Y | 1 | 0 | 1 | ... | 0.012346 | 0.000000 | 10.0 | 0.0 | 0.0 | 1.51794 | 1.50631 | 0.01163 | -0.19858 | NaN |
9722 rows × 22 columns
In [9]:
Copied!
# check that names are unique
df.groupby(["GC", "name"]).size().unique()
# check that names are unique
df.groupby(["GC", "name"]).size().unique()
Out[9]:
array([1])
In [10]:
Copied!
df.to_csv(f"{output_dir}/data.csv", index=False)
df.to_csv(f"{output_dir}/data.csv", index=False)
In [11]:
Copied!
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="n_mutant_edges", label="n_mutant_edges")
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="abundance", label="abundance")
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="delta_bind", label="delta_bind")
plt.ylabel("")
plt.show()
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="n_mutant_edges", label="n_mutant_edges")
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="abundance", label="abundance")
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="delta_bind", label="delta_bind")
plt.ylabel("")
plt.show()
In [12]:
Copied!
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="REI", label="REI")
sns.lineplot(data=df, x="farthest_leaf_distance_nt", y="REI", label="REI")
Out[12]:
<Axes: xlabel='farthest_leaf_distance_nt', ylabel='REI'>
In [13]:
Copied!
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df.query("time=='d15'"), x="delta_bind", bins=120)
plt.title("day 15")
plt.xlim(-3, 3)
plt.subplot(122)
sns.histplot(data=df.query("time=='d20'"), x="delta_bind", bins=120)
plt.title("day 20")
plt.xlim(-3, 3)
plt.tight_layout()
plt.show()
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df.query("time=='d15'"), x="delta_bind_edge", bins=120)
plt.title("day 15")
plt.xlim(-2, 2)
plt.subplot(122)
sns.histplot(data=df.query("time=='d20'"), x="delta_bind_edge", bins=120)
plt.title("day 20")
plt.xlim(-2, 2)
plt.tight_layout()
plt.show()
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df.query("time=='d15'"), x="delta_bind", bins=120)
plt.title("day 15")
plt.xlim(-3, 3)
plt.subplot(122)
sns.histplot(data=df.query("time=='d20'"), x="delta_bind", bins=120)
plt.title("day 20")
plt.xlim(-3, 3)
plt.tight_layout()
plt.show()
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df.query("time=='d15'"), x="delta_bind_edge", bins=120)
plt.title("day 15")
plt.xlim(-2, 2)
plt.subplot(122)
sns.histplot(data=df.query("time=='d20'"), x="delta_bind_edge", bins=120)
plt.title("day 20")
plt.xlim(-2, 2)
plt.tight_layout()
plt.show()
In [14]:
Copied!
gain_thresh = 0.3
loss_thresh = -1
# absolute affinity strata
df.loc[~df.delta_bind.isna(), "affinity stratum"] = "affinity neutral"
df.loc[df["delta_bind"] > gain_thresh, "affinity stratum"] = "affinity gain"
df.loc[df["delta_bind"] < loss_thresh, "affinity stratum"] = "affinity loss"
# affinity change strata
df.loc[~df.delta_bind_edge.isna(), "affinity change stratum"] = "affinity neutral"
df.loc[df["delta_bind_edge"] > gain_thresh, "affinity change stratum"] = "affinity gain"
df.loc[df["delta_bind_edge"] < loss_thresh, "affinity change stratum"] = "affinity loss"
palette = {"affinity loss": "red", "affinity neutral": "grey", "affinity gain": "blue"}
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
g = sns.JointGrid(data=df_time,
x="root_distance_nt",
y="delta_bind_edge",
xlim=(0, df_time.root_distance_nt.max()),
ylim=(-4, 2),
height=8, ratio=3)
g = g.plot_joint(sns.histplot,
bins=(16, 120),
vmax=50,
cmap="viridis",
cbar=True,
cbar_ax=plt.gcf().add_axes([0.59, 0.72, 0.15, 0.02]),
cbar_kws=dict(orientation='horizontal', label="number of nodes"),
)
sns.histplot(ax=g.ax_marg_y,
y=df_time.loc[~df_time.delta_bind_edge.isna(), "delta_bind_edge"],
hue=df_time.loc[~df_time.delta_bind_edge.isna(), "affinity change stratum"],
bins=120,
palette=palette,
fill=True, element="step", linewidth=1,
legend=False)
g.ax_marg_y.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_marg_y.axhline(loss_thresh, c="red", ls="--", lw=1)
g.ax_joint.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_joint.axhline(loss_thresh, c="red", ls="--", lw=1)
g.set_axis_labels(
"divergence from naive root",
r"local affinity $\Delta$",
)
plt.savefig(f"{output_dir}/heatmap_{time}.pdf")
plt.show()
gain_thresh = 0.3
loss_thresh = -1
# absolute affinity strata
df.loc[~df.delta_bind.isna(), "affinity stratum"] = "affinity neutral"
df.loc[df["delta_bind"] > gain_thresh, "affinity stratum"] = "affinity gain"
df.loc[df["delta_bind"] < loss_thresh, "affinity stratum"] = "affinity loss"
# affinity change strata
df.loc[~df.delta_bind_edge.isna(), "affinity change stratum"] = "affinity neutral"
df.loc[df["delta_bind_edge"] > gain_thresh, "affinity change stratum"] = "affinity gain"
df.loc[df["delta_bind_edge"] < loss_thresh, "affinity change stratum"] = "affinity loss"
palette = {"affinity loss": "red", "affinity neutral": "grey", "affinity gain": "blue"}
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
g = sns.JointGrid(data=df_time,
x="root_distance_nt",
y="delta_bind_edge",
xlim=(0, df_time.root_distance_nt.max()),
ylim=(-4, 2),
height=8, ratio=3)
g = g.plot_joint(sns.histplot,
bins=(16, 120),
vmax=50,
cmap="viridis",
cbar=True,
cbar_ax=plt.gcf().add_axes([0.59, 0.72, 0.15, 0.02]),
cbar_kws=dict(orientation='horizontal', label="number of nodes"),
)
sns.histplot(ax=g.ax_marg_y,
y=df_time.loc[~df_time.delta_bind_edge.isna(), "delta_bind_edge"],
hue=df_time.loc[~df_time.delta_bind_edge.isna(), "affinity change stratum"],
bins=120,
palette=palette,
fill=True, element="step", linewidth=1,
legend=False)
g.ax_marg_y.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_marg_y.axhline(loss_thresh, c="red", ls="--", lw=1)
g.ax_joint.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_joint.axhline(loss_thresh, c="red", ls="--", lw=1)
g.set_axis_labels(
"divergence from naive root",
r"local affinity $\Delta$",
)
plt.savefig(f"{output_dir}/heatmap_{time}.pdf")
plt.show()
d15
d20
In [15]:
Copied!
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
g = sns.JointGrid(data=df_time, x=np.log(df_time.REI),
y="delta_bind_edge",
ylim=(-4, 2),
height=8, ratio=3)
g = g.plot_joint(sns.histplot,
bins=(16, 120),
vmax=50,
cmap="viridis",
cbar=True,
cbar_ax=plt.gcf().add_axes([0.59, 0.72, 0.15, 0.02]),
cbar_kws=dict(orientation='horizontal', label="number of nodes"),
)
sns.histplot(ax=g.ax_marg_y,
y=df_time.loc[~df_time.delta_bind_edge.isna(), "delta_bind_edge"], hue=df_time.loc[~df_time.delta_bind_edge.isna(), "affinity change stratum"],
bins=120,
palette=palette,
fill=True, element="step", linewidth=1,
legend=False)
g.ax_marg_y.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_marg_y.axhline(loss_thresh, c="red", ls="--", lw=1)
g.ax_joint.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_joint.axhline(loss_thresh, c="red", ls="--", lw=1)
g.set_axis_labels(r"$\log_{10}$ REI", r"local affinity $\Delta$")
plt.savefig(f"{output_dir}/heatmap2_{time}.pdf")
plt.show()
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
g = sns.JointGrid(data=df_time, x=np.log(df_time.REI),
y="delta_bind_edge",
ylim=(-4, 2),
height=8, ratio=3)
g = g.plot_joint(sns.histplot,
bins=(16, 120),
vmax=50,
cmap="viridis",
cbar=True,
cbar_ax=plt.gcf().add_axes([0.59, 0.72, 0.15, 0.02]),
cbar_kws=dict(orientation='horizontal', label="number of nodes"),
)
sns.histplot(ax=g.ax_marg_y,
y=df_time.loc[~df_time.delta_bind_edge.isna(), "delta_bind_edge"], hue=df_time.loc[~df_time.delta_bind_edge.isna(), "affinity change stratum"],
bins=120,
palette=palette,
fill=True, element="step", linewidth=1,
legend=False)
g.ax_marg_y.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_marg_y.axhline(loss_thresh, c="red", ls="--", lw=1)
g.ax_joint.axhline(gain_thresh, c="blue", ls="--", lw=1)
g.ax_joint.axhline(loss_thresh, c="red", ls="--", lw=1)
g.set_axis_labels(r"$\log_{10}$ REI", r"local affinity $\Delta$")
plt.savefig(f"{output_dir}/heatmap2_{time}.pdf")
plt.show()
d15
d20
In [16]:
Copied!
df["divergence stratum"] = pd.cut(df.root_distance_nt, [0, 1, 2, 4, 8, df.root_distance_nt.max()])
df["leaf distance stratum"] = pd.cut(df.farthest_leaf_distance_nt, [0, 2, 5, 10, df.farthest_leaf_distance_nt.max()])
# branching_stat = "LBI"
# branching_stat = "LBR"
branching_stat = "REI"
linthresh = 0.015
bins1 = [0] + list(np.geomspace(linthresh, df[branching_stat].max() + .01, 12))
bins2 = [0] + list(np.geomspace(linthresh, df[branching_stat].max() + .01, 5))
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df_time.loc[~df_time.delta_bind_edge.isna()],
x="root_distance_nt",
hue="affinity change stratum",
bins=10,
stat="probability", common_norm=False,
palette=palette,
fill=False, element="poly", linewidth=2,
legend=False)
plt.xlabel("divergence from naive root")
plt.subplot(122)
sns.histplot(data=df_time.loc[~df_time.delta_bind_edge.isna()], x=branching_stat,
hue="affinity change stratum",
bins=bins1,
stat="probability", common_norm=False,
palette=palette,
fill=False, element="poly", linewidth=2,
legend=False)
plt.xlabel(branching_stat)
plt.xscale("log")
plt.tight_layout()
plt.savefig(f"{output_dir}/divergence-{branching_stat}_{time}.pdf")
plt.show()
g = sns.displot(data=df_time.loc[~df_time.delta_bind_edge.isna()],
col="divergence stratum",
x=branching_stat,
hue="affinity change stratum",
hue_order=palette.keys(),
palette=palette,
bins=bins2,
stat="probability",
common_norm=False,
aspect=0.7, height=2,
fill=False, element="poly", linewidth=2,
legend=False
)
g.set_titles(col_template="{col_name}")
plt.xscale("log")
plt.savefig(f"{output_dir}/divergence-stratified-{branching_stat}_{time}.pdf")
plt.show()
df["divergence stratum"] = pd.cut(df.root_distance_nt, [0, 1, 2, 4, 8, df.root_distance_nt.max()])
df["leaf distance stratum"] = pd.cut(df.farthest_leaf_distance_nt, [0, 2, 5, 10, df.farthest_leaf_distance_nt.max()])
# branching_stat = "LBI"
# branching_stat = "LBR"
branching_stat = "REI"
linthresh = 0.015
bins1 = [0] + list(np.geomspace(linthresh, df[branching_stat].max() + .01, 12))
bins2 = [0] + list(np.geomspace(linthresh, df[branching_stat].max() + .01, 5))
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
plt.figure(figsize=(6, 3))
plt.subplot(121)
sns.histplot(data=df_time.loc[~df_time.delta_bind_edge.isna()],
x="root_distance_nt",
hue="affinity change stratum",
bins=10,
stat="probability", common_norm=False,
palette=palette,
fill=False, element="poly", linewidth=2,
legend=False)
plt.xlabel("divergence from naive root")
plt.subplot(122)
sns.histplot(data=df_time.loc[~df_time.delta_bind_edge.isna()], x=branching_stat,
hue="affinity change stratum",
bins=bins1,
stat="probability", common_norm=False,
palette=palette,
fill=False, element="poly", linewidth=2,
legend=False)
plt.xlabel(branching_stat)
plt.xscale("log")
plt.tight_layout()
plt.savefig(f"{output_dir}/divergence-{branching_stat}_{time}.pdf")
plt.show()
g = sns.displot(data=df_time.loc[~df_time.delta_bind_edge.isna()],
col="divergence stratum",
x=branching_stat,
hue="affinity change stratum",
hue_order=palette.keys(),
palette=palette,
bins=bins2,
stat="probability",
common_norm=False,
aspect=0.7, height=2,
fill=False, element="poly", linewidth=2,
legend=False
)
g.set_titles(col_template="{col_name}")
plt.xscale("log")
plt.savefig(f"{output_dir}/divergence-stratified-{branching_stat}_{time}.pdf")
plt.show()
d15
d20
In [17]:
Copied!
from scipy.stats import ttest_ind
top = 20
REI_thresh = 0.25
NDS_thresh = 0.7
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
df_time_top = df_time.query(f"REI>{REI_thresh} & LBR>{NDS_thresh}")
plt.figure(figsize=(4.5, 4))
sns.histplot(data=df_time, x="delta_bind")
sns.rugplot(data=df_time_top, x="delta_bind", color="C1", height=0.1, label=f"REI>{REI_thresh} & NDS>{NDS_thresh}", lw=2)
plt.axvline(0, c="k", ls="--", lw=1)
plt.xlabel("affinity")
plt.xlim(-5, None)
plt.legend(loc="upper left")
print(f'REI: p={ttest_ind(df_time_top.delta_bind, df_time.delta_bind[~df_time.index.isin(df_time_top.index)], nan_policy="omit").pvalue:.2f}')
plt.tight_layout()
plt.savefig(f"{output_dir}/topREI_{time}.pdf")
plt.show()
from scipy.stats import ttest_ind
top = 20
REI_thresh = 0.25
NDS_thresh = 0.7
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
df_time_top = df_time.query(f"REI>{REI_thresh} & LBR>{NDS_thresh}")
plt.figure(figsize=(4.5, 4))
sns.histplot(data=df_time, x="delta_bind")
sns.rugplot(data=df_time_top, x="delta_bind", color="C1", height=0.1, label=f"REI>{REI_thresh} & NDS>{NDS_thresh}", lw=2)
plt.axvline(0, c="k", ls="--", lw=1)
plt.xlabel("affinity")
plt.xlim(-5, None)
plt.legend(loc="upper left")
print(f'REI: p={ttest_ind(df_time_top.delta_bind, df_time.delta_bind[~df_time.index.isin(df_time_top.index)], nan_policy="omit").pvalue:.2f}')
plt.tight_layout()
plt.savefig(f"{output_dir}/topREI_{time}.pdf")
plt.show()
d15 REI: p=0.19
d20 REI: p=0.25
Comparison to sisters¶
For statistical independence, we can only take one node—the max REI node—from each tree
In [18]:
Copied!
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
# max REI row per GC
df_max = df_time.loc[df_time.groupby("GC")["REI"].idxmax()]
df_max["sister_excess"] = df_max.delta_bind - df_max.mean_sister_delta_bind
# df_max = df_max[["delta_bind", "mean_sister_delta_bind", "sister_excess", "REI"]].dropna()
df_max.to_csv(f"{output_dir}/data_maxREI_{time}.csv", index=False)
# plot of sister excess vs. REI
plt.figure(figsize=(3, 3))
sns.scatterplot(data=df_max,
y="sister_excess", x="REI", alpha=0.8)
plt.axhline(0, c="k", ls="--", lw=1)
plt.show()
# two sample test
# df_max_nonan = df_max[["delta_bind", "mean_sister_delta_bind"]].dropna()
display(ttest_rel(df_max.delta_bind, df_max.mean_sister_delta_bind, alternative="greater"))
plt.figure(figsize=(3, 3))
plt.plot([[0, 1]] * len(df_max), df_max[["delta_bind", "mean_sister_delta_bind"]].values, "o", c="k", ms=5, alpha=0.25)
for i, (x, y) in enumerate(df_max[["delta_bind", "mean_sister_delta_bind"]].values):
plt.plot([0, 1], [x, y], c="k", lw=1, alpha=0.25)
plt.xticks([0, 1], ["bursts", "mean of bursts sisters"])
plt.ylabel("affinity")
plt.xlim(-0.25, 1.25)
plt.ylim(df.loc[df.groupby("GC")["REI"].idxmax()][["delta_bind", "mean_sister_delta_bind"]].min().min() - 0.2, df.loc[df.groupby("GC")["REI"].idxmax()][["delta_bind", "mean_sister_delta_bind"]].max().max() + 0.2)
plt.tight_layout()
plt.savefig(f"{output_dir}/sister-affinity_{time}.pdf")
plt.show()
plt.figure(figsize=(5, 4))
plt.scatter(data=df_max, y="delta_bind", x="mean_sister_delta_bind", c="REI", cmap="viridis", alpha=0.8, vmin=df.REI.min(), vmax=df.REI.max())
plt.colorbar(label="REI")
plt.plot([-2, 2], [-2, 2], c="k", ls="--", lw=1)
plt.xlabel("mean affinity of sister nodes")
plt.ylabel("node affinity")
plt.tight_layout()
plt.savefig(f"{output_dir}/sister-affinity2_{time}.pdf")
plt.show()
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
# max REI row per GC
df_max = df_time.loc[df_time.groupby("GC")["REI"].idxmax()]
df_max["sister_excess"] = df_max.delta_bind - df_max.mean_sister_delta_bind
# df_max = df_max[["delta_bind", "mean_sister_delta_bind", "sister_excess", "REI"]].dropna()
df_max.to_csv(f"{output_dir}/data_maxREI_{time}.csv", index=False)
# plot of sister excess vs. REI
plt.figure(figsize=(3, 3))
sns.scatterplot(data=df_max,
y="sister_excess", x="REI", alpha=0.8)
plt.axhline(0, c="k", ls="--", lw=1)
plt.show()
# two sample test
# df_max_nonan = df_max[["delta_bind", "mean_sister_delta_bind"]].dropna()
display(ttest_rel(df_max.delta_bind, df_max.mean_sister_delta_bind, alternative="greater"))
plt.figure(figsize=(3, 3))
plt.plot([[0, 1]] * len(df_max), df_max[["delta_bind", "mean_sister_delta_bind"]].values, "o", c="k", ms=5, alpha=0.25)
for i, (x, y) in enumerate(df_max[["delta_bind", "mean_sister_delta_bind"]].values):
plt.plot([0, 1], [x, y], c="k", lw=1, alpha=0.25)
plt.xticks([0, 1], ["bursts", "mean of bursts sisters"])
plt.ylabel("affinity")
plt.xlim(-0.25, 1.25)
plt.ylim(df.loc[df.groupby("GC")["REI"].idxmax()][["delta_bind", "mean_sister_delta_bind"]].min().min() - 0.2, df.loc[df.groupby("GC")["REI"].idxmax()][["delta_bind", "mean_sister_delta_bind"]].max().max() + 0.2)
plt.tight_layout()
plt.savefig(f"{output_dir}/sister-affinity_{time}.pdf")
plt.show()
plt.figure(figsize=(5, 4))
plt.scatter(data=df_max, y="delta_bind", x="mean_sister_delta_bind", c="REI", cmap="viridis", alpha=0.8, vmin=df.REI.min(), vmax=df.REI.max())
plt.colorbar(label="REI")
plt.plot([-2, 2], [-2, 2], c="k", ls="--", lw=1)
plt.xlabel("mean affinity of sister nodes")
plt.ylabel("node affinity")
plt.tight_layout()
plt.savefig(f"{output_dir}/sister-affinity2_{time}.pdf")
plt.show()
d15
TtestResult(statistic=np.float64(nan), pvalue=np.float64(nan), df=np.float64(nan))
d20
TtestResult(statistic=np.float64(nan), pvalue=np.float64(nan), df=np.float64(nan))
Are terrible nodes usually leaves?¶
Note: need to aggregate to a GC-level metric, so we can properly test
In [19]:
Copied!
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
df_groups = df_time.groupby(["GC", "affinity change stratum"]).closest_leaf_distance_nt.mean().reset_index()
plt.figure(figsize=(5, 3))
sns.histplot(data=df_groups,
hue="affinity change stratum", x="closest_leaf_distance_nt",
hue_order=("affinity loss", "affinity neutral", "affinity gain"),
multiple="stack",
bins=np.linspace(0, df.groupby(["GC", "affinity change stratum"]).closest_leaf_distance_nt.mean().max(), 30),
palette=palette,
)
plt.xlabel("average branch length to closest leaf")
plt.ylabel("number of GCs")
plt.tight_layout()
plt.savefig(f"{output_dir}/terrible-histogram_{time}.pdf")
plt.show()
diff_stat = df_groups.query("`affinity change stratum` == 'affinity loss'").set_index("GC").closest_leaf_distance_nt - df_groups.query("`affinity change stratum` == 'affinity gain'").set_index("GC").closest_leaf_distance_nt
from scipy.stats import ttest_1samp
p = ttest_1samp(diff_stat, 0).pvalue
sns.histplot(diff_stat, bins=15)
plt.xlabel("difference in average branch length to closest\nleaf between affinity loss and affinity gain nodes")
plt.ylabel("number of GCs")
plt.axvline(0, c="k", ls="--", lw=2)
plt.annotate(f"p={p:.2e}", (0.1, 0.9), xycoords="axes fraction", va="center", ha="center")
plt.savefig(f"{output_dir}/terrible-significance_{time}.pdf")
plt.show()
for time in ("d15", "d20"):
print(time)
df_time = df.query(f"time=='{time}'")
df_groups = df_time.groupby(["GC", "affinity change stratum"]).closest_leaf_distance_nt.mean().reset_index()
plt.figure(figsize=(5, 3))
sns.histplot(data=df_groups,
hue="affinity change stratum", x="closest_leaf_distance_nt",
hue_order=("affinity loss", "affinity neutral", "affinity gain"),
multiple="stack",
bins=np.linspace(0, df.groupby(["GC", "affinity change stratum"]).closest_leaf_distance_nt.mean().max(), 30),
palette=palette,
)
plt.xlabel("average branch length to closest leaf")
plt.ylabel("number of GCs")
plt.tight_layout()
plt.savefig(f"{output_dir}/terrible-histogram_{time}.pdf")
plt.show()
diff_stat = df_groups.query("`affinity change stratum` == 'affinity loss'").set_index("GC").closest_leaf_distance_nt - df_groups.query("`affinity change stratum` == 'affinity gain'").set_index("GC").closest_leaf_distance_nt
from scipy.stats import ttest_1samp
p = ttest_1samp(diff_stat, 0).pvalue
sns.histplot(diff_stat, bins=15)
plt.xlabel("difference in average branch length to closest\nleaf between affinity loss and affinity gain nodes")
plt.ylabel("number of GCs")
plt.axvline(0, c="k", ls="--", lw=2)
plt.annotate(f"p={p:.2e}", (0.1, 0.9), xycoords="axes fraction", va="center", ha="center")
plt.savefig(f"{output_dir}/terrible-significance_{time}.pdf")
plt.show()
d15
d20
In [ ]:
Copied!