Code Snippets

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Python Pandas numpy matplotlib seaborn common From line 1 of scripts/plot-depths.py

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Python Pandas numpy matplotlib seaborn common From line 1 of scripts/plot-depths.py

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Python Pandas numpy matplotlib seaborn common From line 1 of scripts/plot-depths.py

Snakemake workflow: dna-seq-gatk-variant-calling (v2.1.1)

import sys
sys.stderr = open(snakemake.log[0], "w")

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Python Pandas numpy matplotlib seaborn common From line 1 of scripts/plot-depths.py

In this repository, we present the code for the analysis of study of the transcription's impact on Escherichia coli chromosome.

import bacchus.directional as bcd
import bacchus.insulation as bci
import bacchus.io as bcio
import cooler
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.collections as mc
import numpy as np
import os
import pandas as pd
import scipy.sparse as sp
import scipy.stats as st
import seaborn as sns
from random import choices

mat_file = snakemake.input.mat
rna_file = snakemake.input.rna
annotation_file = snakemake.input.annotation
cmap = snakemake.params.cmap
dpi = snakemake.params.dpi
text_file = str(snakemake.output.text_file)
full_mat_file = str(snakemake.output.full_mat)
zoom_1_file = str(snakemake.output.zoom1)
zoom_2_file = str(snakemake.output.zoom2)
full_mat_di_file = str(snakemake.output.full_mat_di)
zoom_1_di_file = str(snakemake.output.zoom1_di)
zoom_2_di_file = str(snakemake.output.zoom2_di)
out_files_mat = [full_mat_file, zoom_1_file, zoom_2_file]
out_files_mat_di = [full_mat_di_file, zoom_1_di_file, zoom_2_di_file]
bor_trans = str(snakemake.output.bor_trans)
trans_bor = str(snakemake.output.trans_bor)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)


def write_stats(text_file, text, writing_type="a"):
    with open(text_file, writing_type) as out:
        out.write(f"{text}\n")


# Compute CIDs at 5kb resolution with 100kb window size.
M5000 = cooler.Cooler(f"{mat_file}::/resolutions/{5000}").matrix(
    balance=True, sparse=True
)[:]
di5000 = bcd.directional_index(M5000, 20)
l5000 = bcd.di_borders(di5000)
write_stats(
    text_file,
    f"Numbers of CIDs at 5kb with 100kb window size: {len(l5000)}",
    "w",
)

# Compute CIDs at 2kb resolution with 50kb window size.
M2000 = cooler.Cooler(f"{mat_file}::/resolutions/{2000}").matrix(
    balance=True, sparse=True
)[:]
di2000 = bcd.directional_index(M2000, 25)
l2000 = bcd.di_borders(di2000)
write_stats(
    text_file, f"Numbers of CIDs at 2kb with 50kb window size: {len(l2000)}"
)

# Compute CIDs at 1kb resolution using insulation score.
M1000 = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=True
)[:]
di1000 = bcd.directional_index(M1000, 20)
l1000 = bcd.di_borders(di1000)
write_stats(
    text_file, f"Numbers of CIDs at 1kb with 50kb window size: {len(l1000)}"
)

mat = cooler.Cooler(f"{mat_file}::/resolutions/{1000}").matrix(
    balance=True, sparse=False
)[:]
mat[np.isnan(mat)] = 0
final_borders_1000, lri_1000 = bci.get_insulation_score(
    mat, [10, 15, 20, 25, 30]
)
write_stats(
    text_file,
    f"{len(final_borders_1000)} CIDs have been detected with insulation score at 1kb resolution.",
)


rna, _ = bcio.extract_big_wig(rna_file, ztransform=False)

annotation = pd.DataFrame(
    columns=["type", "start", "end", "strand", "gene_name", "rpkm"]
)

n = 0
with open(annotation_file, "r") as file:
    for line in file:
        if line.startswith("#"):
            continue
        elif line.startswith(">"):
            break
        else:
            line = line.split("\t")
            if line[2] in ["gene", "tRNA", "rRNA"]:
                if line[2] == "gene":
                    name = line[8].split("Name=")[-1].split(";")[0]
                    annot = {
                        "type": line[2],
                        "start": int(line[3]),
                        "end": int(line[4]),
                        "strand": line[6],
                        "gene_name": name,
                        "rpkm": np.mean(rna[int(line[3]) : int(line[4])]),
                    }
                    annotation = annotation.append(annot, ignore_index=True)


###
# Print matrices with CIDs borders (Fig 1e).
###
starts = [0, 2500, 3000]
ends = [len(mat), 3000, 3500]
col = ["blue", "lime", "cyan"]

for i in range(3):
    out_file, start, end = out_files_mat[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.nanpercentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in final_borders_1000:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='magenta')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()


###
# Print full matrices with CIDs borders using DI only (Supp Fig 1f).
###

for i in range(3):
    out_file, start, end = out_files_mat_di[i], starts[i], ends[i]

    vmax = 99
    title = None
    fig, ax = plt.subplots(1, 1, figsize=(7, 7), dpi=dpi)

    # Axis values
    scaling_factor = 1.0
    axis = "kb"

    # No end values given.
    if end == 0:
        end = len(mat)

    # Display plots
    im = ax.imshow(
        mat[start:end, start:end],
        cmap=cmap,
        vmin=0,
        vmax=np.percentile(mat, vmax),
        extent=(
            start * scaling_factor,
            end * scaling_factor,
            end * scaling_factor,
            start * scaling_factor,
        ),
    )

    # Legend
    ax.set_xlabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.set_ylabel(f"Genomic coordinates ({axis:s})", fontsize=16)
    ax.tick_params(axis="both", labelsize=16)

    # Title
    if title is not None:
        ax.set_title(title, size=18)

    # Colorbar
    cbar = plt.colorbar(im, shrink=0.33, anchor=(0, 0.5))
    cbar.ax.tick_params(labelsize=16)

    first = True
    for i in [x * 5 for x in l5000]:
        if i in np.arange(start, end):
            #         plt.axvline(i, lw=1, ls='dashed', c='blue')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[0],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[0], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[0], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[0],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l2000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[1],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[1], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[1], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[1],
        lw=1,
        zorder=0.5,
    )

    first = True
    for i in [x * 5 for x in l1000]:
        if i in np.arange(start, end):
            # plt.axvline(i, lw=1, ls='dashed', c='cyan')
            if first:
                s = i
                first = False
                ax.axvline(
                    s,
                    ymin=(end - s) / (end - start),
                    ymax=1,
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
                ax.axhline(
                    s,
                    xmin=0,
                    xmax=1 - ((end - s) / (end - start)),
                    c=col[2],
                    lw=1,
                    zorder=0.5,
                )
            else:
                e = i
                ax.add_patch(
                    patches.Rectangle(
                        (s, s), e - s, e - s, edgecolor=col[2], fill=False
                    )
                )
                s = i
    ax.axvline(
        s, ymin=(end - s) / (end - start), ymax=0, c=col[2], lw=1, zorder=0.5
    )
    ax.axhline(
        s,
        xmin=1,
        xmax=1 - ((end - s) / (end - start)),
        c=col[2],
        lw=1,
        zorder=0.5,
    )

    # Savefig
    plt.savefig(out_file, dpi=dpi, bbox_inches="tight")
    plt.close()

###
# Transcription at borders.
###


def get_borders_transcriptions(annotation, borders, size, step):
    borders_transcriptions = []
    for i in annotation.index:
        if annotation.loc[i, "strand"] == "-":
            pos = annotation.loc[i, "end"]
        else:
            pos = annotation.loc[i, "start"]
        for j in borders:
            a = j * size + step
            if abs(pos - a) < 2500:
                borders_transcriptions.append(annotation.loc[i, "rpkm"])
    return borders_transcriptions


borders_trans_5000 = get_borders_transcriptions(annotation, l5000, 5000, 2500)
borders_trans_2000 = get_borders_transcriptions(annotation, l2000, 5000, 1000)
borders_trans_1000 = get_borders_transcriptions(
    annotation, final_borders_1000, 1000, 500
)
data = {
    "RPKM (log)": np.log(
        list(annotation.rpkm)
        + list(np.log(borders_trans_5000))
        + list(np.log(borders_trans_2000))
        + list(np.log(borders_trans_1000))
    ),
    "Genes": list(np.repeat(f"All genes\nn={len(annotation)}", len(annotation)))
    + list(
        np.repeat(f"5kb\nn={len(borders_trans_5000)}", len(borders_trans_5000))
    )
    + list(
        np.repeat(f"2kb\nn={len(borders_trans_2000)}", len(borders_trans_2000))
    )
    + list(
        np.repeat(f"1kb\nn={len(borders_trans_1000)}", len(borders_trans_1000))
    ),
}
data = pd.DataFrame(data)
data.replace([np.inf, -np.inf, np.nan], 0, inplace=True)

sns.violinplot(x="Genes", y="RPKM (log)", data=data, palette="tab10")
plt.xlabel("Resolution of the matrix to detect the borders", size=14)
plt.ylabel("RPKM (log)", size=14)
plt.title(
    "Genes transcription of genes at less\nthan 5kb of a detected border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(bor_trans, bbox_inches="tight")
plt.close()

write_stats(
    text_file,
    f"p-value between 5kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_5000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 2kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_2000))[1]}",
)
write_stats(
    text_file,
    f"p-value between 1kb borders and whole genome: {st.mannwhitneyu(np.log(list(annotation.rpkm)), np.log(borders_trans_1000))[1]}",
)

##
# Transcription depending on distance borders
###

annotation["dist_5kb"] = 0
annotation["dist_2kb"] = 0
annotation["dist_1kb"] = 0
annotation["log1p_rpkm"] = np.log1p(annotation["rpkm"])
for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l5000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l5000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 1) * 5000:
                a = pos - (x + 1) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_5kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in l2000:
            pos = annotation.loc[i, "start"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in l2000:
            pos = annotation.loc[i, "end"]
            if pos < x * 5000:
                a = x * 5000 - pos
            elif pos > (x + 0.4) * 5000:
                a = pos - (x + 0.4) * 5000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_2kb"] = value

for i in annotation.index:
    value = np.inf
    if annotation.loc[i, "strand"] == "+":
        for x in final_borders_1000:
            pos = annotation.loc[i, "start"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    else:
        for x in final_borders_1000:
            pos = annotation.loc[i, "end"]
            if pos < x * 1000:
                a = x * 1000 - pos
            elif pos > (x + 1) * 1000:
                a = pos - (x + 1) * 1000
            else:
                a = -1
            value = min(a, value)
    annotation.loc[i, "dist_1kb"] = value

bin_means_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)
bin_means_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic="mean",
    bins=41,
    range=(-5000, 200000),
)


def bootstrap_down(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 5)


def bootstrap_up(data):
    l = np.zeros((10000))
    for i in range(10000):
        l[i] = np.mean(choices(data, k=len(data)))
    return np.percentile(l, 95)


bin_perc5_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l1, bin_edges_l1, binnumber_l1 = st.binned_statistic(
    annotation.dist_5kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l2, bin_edges_l2, binnumber_l2 = st.binned_statistic(
    annotation.dist_2kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)
bin_perc5_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_down,
    bins=41,
    range=(-5000, 200000),
)
bin_perc95_l3, bin_edges_l3, binnumber_l3 = st.binned_statistic(
    annotation.dist_1kb,
    annotation.rpkm,
    statistic=bootstrap_up,
    bins=41,
    range=(-5000, 200000),
)

plt.plot(bin_edges_l1[1:] / 1000, bin_means_l1, label="5kb")
plt.fill_between(
    bin_edges_l1[1:] / 1000, bin_perc5_l1, bin_perc95_l1, alpha=0.5
)
plt.plot(bin_edges_l2[1:] / 1000, bin_means_l2, label="2kb")
plt.fill_between(
    bin_edges_l2[1:] / 1000, bin_perc5_l2, bin_perc95_l2, alpha=0.5
)
plt.plot(bin_edges_l3[1:] / 1000, bin_means_l3, label="1kb")
plt.fill_between(
    bin_edges_l3[1:] / 1000, bin_perc5_l3, bin_perc95_l3, alpha=0.5
)
plt.xlabel("Distance from the closest border (kb)", size=14)
plt.ylabel("Mean RPKM", size=14)
plt.xlim(0, 50)
plt.title(
    "Genes transcription depending on\nthe distance to the closest border",
    size=16,
)
plt.xticks(size=12)
plt.yticks(size=12)
plt.legend()
plt.savefig(trans_bor, bbox_inches="tight")
plt.close()

Python Pandas numpy matplotlib seaborn scipy cooler cooler bacchus From line 4 of scripts/CIDs_analysis_resolution.py

import bacchus.hic as bch
import bacchus.io as bcio
import bacchus.plot as bcp
import cooler
import copy
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np
import os
from os.path import join
import pandas as pd
from scipy import ndimage
import seaborn as sns

# Import snakemake parameters.
annotation_file = snakemake.input.annotation
mat_wt_file = str(snakemake.input.cool_wt)
mat_rf_file = str(snakemake.input.cool_rf)
rna_wt_file = snakemake.input.rna_wt
rna_rf_file = snakemake.input.rna_rf
cov_wt_file = snakemake.input.cov_wt
cov_rf_file = snakemake.input.cov_rf
gc_file = snakemake.input.gc
epod_file = snakemake.input.EPODs
res = int(snakemake.params.res)
cmap = snakemake.params.cmap
outdir = str(snakemake.params.outdir)
width = snakemake.params.width
pos = snakemake.params.positions

# Create outdir if necessary.
os.makedirs(outdir, exist_ok=True)


def import_annotation_gff(annotation_file):
    """Function to create a table of the gene positions from the gff file."""
    annotation = pd.DataFrame(
        columns=["type", "start", "end", "strand", "name"]
    )
    with open(annotation_file, "r") as file:
        for line in file:
            # Header.
            if line.startswith("#"):
                continue
            # Stop at the fasta sequences.
            elif line.startswith(">"):
                break
            else:
                line = line.split("\t")
                if line[2] in ["gene", "tRNA", "rRNA"]:
                    if line[2] == "gene":
                        name = line[8].split("Name=")[-1].split(";")[0]
                        # Extract gene position.
                        annot = {
                            "type": line[2],
                            "start": int(line[3]),
                            "end": int(line[4]),
                            "strand": line[6],
                            "gene_name": name,
                        }
                        annotation = annotation.append(annot, ignore_index=True)
    return annotation


# Import files.
annotation = import_annotation_gff(annotation_file)
mat_wt = cooler.Cooler(f"{mat_wt_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_wt[np.isnan(mat_wt)] = 0
mat_rf = cooler.Cooler(f"{mat_rf_file}::/resolutions/{res}").matrix(
    balance=True, sparse=False
)[:]
mat_rf[np.isnan(mat_rf)] = 0
rna_wt, _ = bcio.extract_big_wig(rna_wt_file, binning=100)
rna_rf, _ = bcio.extract_big_wig(rna_rf_file, binning=100)
cov_wt, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
cov_rf, _ = bcio.extract_big_wig(cov_wt_file, binning=1000)
gc_content = pd.read_csv(gc_file, sep="\t", header=None).iloc[:, 1]
epod = pd.read_csv(epod_file, sep="\t", header=None)

# Make the rotation of the matrix to plot them
mat_wt_rot = copy.copy(mat_wt)
mat_wt_rot = bch.interpolate_white_lines(mat_wt_rot)
mat_wt_rot[np.isnan(mat_wt_rot)] = 0
mat_wt_rot = ndimage.rotate(mat_wt_rot, 45, reshape=True)
mat_rf_rot = copy.copy(mat_rf)
mat_rf_rot = bch.interpolate_white_lines(mat_rf_rot)
mat_rf_rot[np.isnan(mat_rf_rot)] = 0
mat_rf_rot = ndimage.rotate(mat_rf_rot, 45, reshape=True)


def plot_region(
    M1,
    M1_rot,
    M2,
    M2_rot,
    rna1,
    rna2,
    cov1,
    cov2,
    annotation,
    binning,
    width,
    zoom_ini,
    outfile,
    split=False,
):
    rna_binning = 100
    pal = sns.color_palette("Paired")

    # Defined values to specify the borders of the matrices and tables
    zoom = [zoom // binning for zoom in zoom_ini]
    width = width / 1000
    row1 = len(M1_rot) // 2 - int(np.sqrt(2) * width)
    row2 = len(M1_rot) // 2 + int(np.sqrt(2) * width)
    col1 = int(zoom[0] * np.sqrt(2))
    col2 = int(zoom[1] * np.sqrt(2))
    annotation_zoom = annotation[
        np.logical_and(
            annotation.start > zoom_ini[0], annotation.end < zoom_ini[1]
        )
    ]
    ymax = np.nanmax(
        np.concatenate(
            (
                rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
                rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            )
        )
    )
    max_cov = np.nanmax(cov1) * 1.02
    min_cov = np.nanmin(cov1) * 0.98

    # Define panels depending on split or not.
    if split and ymax > 1500:
        a = 4
        fig, ax = plt.subplots(
            7,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 6, 12, 3, 3]},
            sharex=False,
        )

        # Parameters to merge panel 2 and 3.
        d = 0.02
        D = 0.04
        for j in range(2):
            ax[a - 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
            ax[a - 1, j].tick_params(axis="both", labelsize=15)
            ax[a - 1, j].spines["bottom"].set_visible(False)
            ax[a - 1, j].spines["top"].set_visible(False)
            ax[a - 1, j].spines["right"].set_visible(False)
            ax[a - 1, j].set_ylim(ymax - 525, ymax)
            ax[a - 1, j].get_xaxis().set_visible(False)

            # Add the small cut between the two panels
            kwargs = dict(
                transform=ax[a - 1, j].transAxes, color="k", clip_on=False
            )
            ax[a - 1, j].plot((-d, +d), (-D, +D), **kwargs)  # top-left diagonal
            kwargs.update(
                transform=ax[a, j].transAxes
            )  # switch to the bottom axes
            ax[a, j].plot(
                (-d, +d), (1 - d, 1 + d), **kwargs
            )  # bottom-left diagonal

        # Plot the RNA on the split panel
        ax[a - 1, 0].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )
        ax[a - 1, 1].fill_between(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000
            ),
            rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
            color="black",
        )

    else:
        a = 3
        fig, ax = plt.subplots(
            6,
            2,
            figsize=(16, 12),
            gridspec_kw={"height_ratios": [1, 30, 1, 15, 3, 3]},
            sharex=False,
        )

    # Plot expressed genes - blue are forward - red are reversed
    for j in range(2):
        ax[0, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[0, j].get_xaxis().set_visible(False)
        ax[0, j].get_yaxis().set_visible(False)
        ax[2, j].set_xlim(zoom_ini[0], zoom_ini[1])
        ax[2, j].get_xaxis().set_visible(False)
        ax[2, j].get_yaxis().set_visible(False)
    pos = 1
    for i in range(len(annotation_zoom)):
        # Extract annotation information
        annot = annotation_zoom.iloc[
            i,
        ]
        strand = annot.strand
        start = annot.start // rna_binning
        end = annot.end // rna_binning
        name = annot.gene_name
        # Defined color depending on the strand
        if strand == "+":
            color = pal.as_hex()[1]
        else:
            color = pal.as_hex()[5]
        # Print it only if it transcribed (10% most transcribed genes)
        if np.mean(rna1[start:end]) >= 120.7628998139441:
            pos = pos * -1
            for j in range(2):
                ax[0, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
                ax[0, j].text(
                    x=(start + ((end - start) / 2)) * 100,
                    y=pos + 0.5,
                    s=name,
                    rotation=90 * pos,
                    wrap=True,
                )
    color = pal.as_hex()[3]
    for i in epod.index:
        start = epod.loc[i, 3]
        end = epod.loc[i, 4]
        if (
            (start > zoom_ini[0])
            and (start < zoom_ini[1])
            or (end > zoom_ini[0])
            and (end < zoom_ini[1])
        ):
            start = max(start, zoom_ini[0]) // rna_binning
            end = min(end, zoom_ini[1]) // rna_binning
            for j in range(2):
                ax[2, j].add_patch(
                    patches.Rectangle(
                        (start * 100, 0),
                        (end - start) * 100,
                        1,
                        edgecolor=color,
                        facecolor=color,
                        fill=True,
                    )
                )
    #                 ax[2, j].text(
    #                     x=(start + ((end - start) / 2)) * 100,
    #                     y=pos + 0.5,
    #                     s="EPOD",
    #                     rotation=90 * pos,
    #                     wrap=True,
    #                 )

    # Plot the matrices
    ax[1, 0].set_ylabel("Genomic distance (kb)", fontsize=15)
    for j in range(2):
        ax[1, j].get_xaxis().set_visible(False)
        ax[1, j].tick_params(axis="both", labelsize=15)

    # Plot the matrice 1
    im = ax[1, 0].imshow(
        M1_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # Plot the matrice 2
    ax[1, 1].imshow(
        M2_rot[row1:row2, col1:col2],
        cmap=cmap,
        interpolation="none",
        vmin=0,
        vmax=np.nanpercentile(M1[zoom[0] : zoom[1], zoom[0] : zoom[1]], 97),
        extent=(col1 / np.sqrt(2), col2 / np.sqrt(2), width, -width),
    )

    # RNAseq legends and plot at the bottom panel.
    ax[a, 0].set_ylabel("RNA count (CPM)", fontsize=15)
    for j in range(2):
        #         ax[a, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a, j].get_xaxis().set_visible(False)
        ax[a, j].tick_params(axis="both", labelsize=15)
        ax[a, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a, j].spines["top"].set_visible(False)
        ax[a, j].spines["right"].set_visible(False)
        if split:
            if ymax > 1500:
                ax[a, j].set_ylim(0, 1050)
            else:
                ax[a, j].set_ylim(0, 1500)
        else:
            ax[a, j].set_ylim(0, ymax)

    # Plot RNA at the bottom panel
    ax[a, 0].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna1[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )
    ax[a, 1].fill_between(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, rna_binning / 1000),
        rna2[zoom_ini[0] // rna_binning : zoom_ini[1] // rna_binning],
        color="black",
    )

    # GC content
    gc_binning = 100
    for j in range(2):
        ax[a + 1, j].plot(
            np.arange(
                zoom_ini[0] // 1000, zoom_ini[1] // 1000, gc_binning / 1000
            ),
            gc_content[
                (zoom_ini[0] + 250)
                // gc_binning : (zoom_ini[1] + 250)
                // gc_binning
            ],
            color="black",
        )
        ax[a + 1, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 1, j].tick_params(axis="both", labelsize=15)
        ax[a + 1, j].get_xaxis().set_visible(False)
    ax[a + 1, 0].set_ylabel("GC", fontsize=15)

    # Coverage
    cov_binning = 1000
    ax[a + 2, 0].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov1[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    ax[a + 2, 1].plot(
        np.arange(zoom_ini[0] // 1000, zoom_ini[1] // 1000, cov_binning / 1000),
        cov2[
            (zoom_ini[0] + 250)
            // cov_binning : (zoom_ini[1] + 250)
            // cov_binning
        ],
        color="black",
    )
    for j in range(2):
        ax[a + 2, j].set_xlim(zoom_ini[0] // 1000, zoom_ini[1] // 1000)
        ax[a + 2, j].set_ylim(min_cov, max_cov)
        ax[a + 2, j].set_xlabel("Genomic coordinates (kb)", size=15)
        ax[a + 2, j].tick_params(axis="both", labelsize=15)
    ax[a + 2, 0].set_ylabel("HiC\ncoverage\n(CPM)", fontsize=15)

    # Colorbar
    cbar = fig.colorbar(
        im, ax=ax.ravel().tolist(), shrink=0.3, anchor=(1.2, 0.75)
    )
    cbar.ax.tick_params(labelsize=15)

    # Save the fig and adjust margins
    if split and ymax > 1500:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    else:
        plt.subplots_adjust(wspace=0.2, hspace=0.1)
    plt.savefig(outfile, bbox_inches="tight", dpi=200)


for split in [True, False]:
    if split:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}_split.pdf",
        )
    else:
        outfile = join(
            outdir,
            f"region_{pos[0]}_{pos[1]}.pdf",
        )
    position = [int(p) * 1000 for p in pos]
    plot_region(
        mat_wt,
        mat_wt_rot,
        mat_rf,
        mat_rf_rot,
        rna_wt,
        rna_rf,
        cov_wt,
        cov_rf,
        annotation,
        res,
        width,
        position,
        outfile,
        split=split,
    )

Python Snakemake Pandas numpy matplotlib seaborn scipy cooler cooler bacchus From line 4 of scripts/plot_zoom.py

import os
import bacchus.blob as bcb
import bacchus.hic as bch
import bacchus.io as bcio
import numpy as np
import copy
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import scipy
import scipy.stats as st
import seaborn as sns
import cooler

mat_file = snakemake.input.mat
rna_file = snakemake.input.rna
cov_file = snakemake.input.cov
gc_file = snakemake.input.gc
frags_file = snakemake.input.frags
epod_file = snakemake.input.EPODs
out_bed = str(snakemake.output.bed)
text_file = str(snakemake.output.text_file)
gc_plot = str(snakemake.output.gc)
cov_plot = str(snakemake.output.cov)
RS_plot = str(snakemake.output.RS)
rna_plot = str(snakemake.output.rna)
dist_bar = str(snakemake.output.dist_bar)
dist_line = str(snakemake.output.dist_line)
venn_plot = str(snakemake.output.venn_plot)

# Create outdir if necessary.
os.makedirs(str(snakemake.params.outdir), exist_ok=True)


def write_stats(text_file, text, writing_type="a"):
    with open(text_file, writing_type) as out:
        out.write(f"{text}\n")


# Import data
mat = cooler.Cooler(f"{mat_file}::/resolutions/500").matrix(balance=True)[:]
mat[np.isnan(mat)] = 0
rna, _ = bcio.extract_big_wig(rna_file, ztransform=False)
rna500, _ = bcio.extract_big_wig(rna_file, binning=500, ztransform=False)
rna500 = np.log10(rna500[:-2] + 1e-10)
cov, _ = bcio.extract_big_wig(cov_file, binning=500)
cov = cov[:-2]
gc_content = pd.read_csv(gc_file, sep="\t", header=None).iloc[:, 1]
frags = pd.read_csv(frags_file, sep="\t")
epod_data = pd.read_csv(epod_file, sep="\t", header=None)
mask = bch.mask_white_line(mat)
mat[mask] = np.nan
mat[:, mask] = np.nan

# Save TIDs positions.
blobs, blob_score = bcb.find_blobs(
    mat, size=5, n_mads=10, refine=1 / 3, rna=rna
)
with open(out_bed, "w") as out:
    for blob in blobs:
        out.write(f"blob\t{blob.start * 500}\t{blob.end * 500}\n")

# Save stats on TIDs.
write_stats(
    text_file, f"Numbers of blobs without interpolation: {len(blobs)}", "w"
)
write_stats(
    text_file,
    f"Cumulative size of the blobs without interpolation: {np.sum([x.size for x in blobs]) * 0.5}kb",
)

# Mask bins within a TIDs or not.
blob_mask = np.repeat(False, len(mat))
for i in blobs:
    for j in range(i.start, i.end):
        blob_mask[j] = True
blob_mask_r = np.repeat(False, len(mat))
for i, v in enumerate(blob_mask):
    if v:
        blob_mask_r[i] = False
    else:
        blob_mask_r[i] = True

# GC content
gc_content = np.array(gc_content[np.arange(0, len(gc_content), 5)])
gc_blob = gc_content[blob_mask]
gc_other = gc_content[blob_mask_r]
data = {"GC": gc_content, "blobs": blob_mask}
data = pd.DataFrame(data)
sns.violinplot(x="blobs", y="GC", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("GC content %", size=14)
plt.title("GC content distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(gc_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob GC content: {np.mean(gc_blob)}")
write_stats(text_file, f"Other GC content: {np.mean(gc_other)}")
write_stats(text_file, f"Global GC content: {np.mean(gc_content)}")
write_stats(text_file, f"T-test blob-other: {st.ttest_ind(gc_blob, gc_other)}")
write_stats(text_file, f"T-test blob-all: {st.ttest_ind(gc_blob, gc_content)}")

# Restriction sites
start_pos = frags.start_pos[1:]
frags = np.zeros(len(gc_content))
for i in start_pos:
    frags[i // 500] += 1
data["frags"] = frags

sns.boxplot(x="blobs", y="frags", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("Restriction site", size=14)
plt.title("Restriction sites distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(RS_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob RS: {np.mean(frags[blob_mask])}")
write_stats(text_file, f"Other RS: {np.mean(frags[blob_mask_r])}")
write_stats(text_file, f"Global RS: {np.mean(frags)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(frags[blob_mask], frags[blob_mask_r])}",
)
write_stats(
    text_file, f"T-test blob-all: {st.ttest_ind(frags[blob_mask], frags)}"
)

# Coverage
data["cov"] = cov

sns.violinplot(x="blobs", y="cov", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("HiC Coverage (CPM)", size=14)
plt.title("Coverage distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.savefig(cov_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob coverage: {np.mean(cov[blob_mask])}")
write_stats(text_file, f"Other coverage: {np.mean(cov[blob_mask_r])}")
write_stats(text_file, f"Global coverage: {np.mean(cov)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(cov[blob_mask], cov[blob_mask_r])}",
)
write_stats(text_file, f"T-test blob-all: {st.ttest_ind(cov[blob_mask], cov)}")

# Transcription
data["rna"] = rna500

sns.violinplot(x="blobs", y="rna", data=data, palette="tab10")
plt.xlabel("Blobs", size=14)
plt.ylabel("Transcription (CPM)", size=14)
plt.title("Transcription distribution in blobs", size=16)
plt.xticks(size=12)
plt.yticks(size=12)
plt.ylim(-8, 6)
plt.savefig(rna_plot, bbox_inches="tight")
plt.close()

write_stats(text_file, f"Blob transcription: {np.mean(rna500[blob_mask])}")
write_stats(text_file, f"Other transcription: {np.mean(rna500[blob_mask_r])}")
write_stats(text_file, f"Global transcription: {np.mean(rna500)}")
write_stats(
    text_file,
    f"T-test blob-other: {st.ttest_ind(rna500[blob_mask], rna500[blob_mask_r])}",
)
write_stats(
    text_file, f"T-test blob-all: {st.ttest_ind(rna500[blob_mask], rna500)}"
)

# TIDs distribution

tid_data = pd.read_csv(out_bed, sep="\t", header=None)
ori = (3925744 + 3925975) // 2
ter = 1590754
macrodomain = [x * 5000 for x in [45, 231, 393, 546, 685, 842]]
x = np.arange(0, 4_641_652, 500)
y = np.zeros(len(x))
for i in tid_data.index:
    start = tid_data.loc[i, 1]
    end = tid_data.loc[i, 2]
    for k in np.arange(start, end, 500):
        y[k // 500] = 1

# Bar plot
fig, ax = plt.subplots(figsize=(20, 5))
ax.fill_between(x / 1_000_000, y, color="k", alpha=1)
plt.xticks(size=16)
plt.xlabel("Genomics coordinates (Mb)", size=18)
ax.yaxis.set_visible(False)
ax.set_title("TIDs distribution", size=20)
for i in macrodomain[:-1]:
    plt.axvline(
        x=i / 1_000_000, c="green", linestyle="dashed", alpha=0.5, linewidth=3
    )
plt.axvline(
    x=macrodomain[-1] / 1_000_000,
    c="green",
    linestyle="dashed",
    alpha=0.5,
    label="macrodomain",
    linewidth=3,
)
plt.axvline(
    x=ori / 1_000_000,
    c="r",
    linestyle="dashed",
    alpha=0.5,
    label="ori",
    linewidth=3,
)
plt.axvline(
    x=ter / 1_000_000,
    c="blue",
    linestyle="dashed",
    alpha=0.5,
    label="ter",
    linewidth=3,
)
plt.legend()
plt.savefig(dist_bar)
plt.close()

# Line plot
bins = np.arange(0, 4_641_652, 50_000)
hist, x_bins = np.histogram(x, bins, weights=y)
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(x_bins[:-1] / 1_000_000, hist * 1, color="k")
ax.set_xlabel("Genomic coordinates (Mb)", size=16)
ax.set_ylabel("Ratio of TID in a 10kb bin (%)", size=16)
ax.tick_params(labelsize=16)
ax.set_title("TIDs distribution", size=20)
ax.set_xlim(0, 4_641_652 / 1_000_000)
for i in macrodomain[:-1]:
    plt.axvline(x=i / 1_000_000, c="k", linestyle="dashed", alpha=0.5)
plt.axvline(
    x=macrodomain[-1] / 1_000_000,
    c="k",
    linestyle="dashed",
    alpha=0.5,
    label="macrodomain",
)
plt.axvline(
    x=ori / 1_000_000, c="r", linestyle="dashed", alpha=0.5, label="ori"
)
plt.axvline(
    x=ter / 1_000_000, c="blue", linestyle="dashed", alpha=0.5, label="ter"
)
plt.legend()
plt.savefig(dist_line)
plt.close()

# Venn Diagramm

x = np.arange(0, 4_641_652, 5)
y_tid = np.zeros(len(x))
y_epod = np.zeros(len(x))
for i in tid_data.index:
    start = tid_data.loc[i, 1]
    end = tid_data.loc[i, 2]
    for k in np.arange(start, end, 5):
        y_tid[k // 5] = 1
for i in epod_data.index:
    start = epod_data.loc[i, 3]
    end = epod_data.loc[i, 4]
    for k in np.arange(start, end, 5):
        y_epod[k // 5] = 1
epod_only = 0
tid_only = 0
both = 0
none = 0
for i in x:
    k = i // 5
    if (y_epod[k] == 1) & (y_tid[k] == 1):
        both += 5
    elif (y_epod[k] == 0) & (y_tid[k] == 1):
        tid_only += 5
    elif (y_epod[k] == 1) & (y_tid[k] == 0):
        epod_only += 5
    elif (y_epod[k] == 0) & (y_tid[k] == 0):
        none += 5

venn2(
    subsets=(epod_only // 1000, tid_only // 1000, both // 1000),
    set_labels=("EPODs", "TIDs"),
)
plt.savefig(venn_plot)
plt.close()

Python Pandas numpy matplotlib seaborn scipy cooler cooler bacchus matplotlib-venn From line 4 of scripts/TIDs_analysis.py

MAGqual is a command line tool to evaluate the quality of metagenomic bins and generate recommended metadata in line with the MIMAG standards (v0.1.1-beta)

import pandas as pd
import glob
import argparse
import os
from shutil import copyfile
import seaborn as sns
import matplotlib.pyplot as plt

def qual_cluster(comp, cont):
    if (comp >90) and (cont<5):
        qual = "High Quality"
    elif (comp >= 50) and (cont <10):
        qual = "Medium Quality"
    elif (comp <50) and (cont <10):
        qual = "Low Quality"
    else:
        qual = "Failed"
    return qual

def gen_qual(comp, cont):
    if (comp - (cont*5)) >= 50:
        genome = "y"
    else:
        genome = "n"
    return genome

def bsearch(bakta_loc, cluster): 
    length = 0
    loc = str(bakta_loc + str(cluster) + '/' + str(cluster) + '.txt') #don't copy this bit change it !
    # loc = str(str(cluster) + '.txt') #don't copy this bit change it !
    for name in glob.glob(loc):
        bakta_file = name
        with open(bakta_file, 'r') as bakta_log:
            for line in bakta_log:
                if "Length:" in line:
                    length = int(line.split(":")[1])
                if "Count:" in line: 
                    count = int(line.split(":")[1])
                if "N50:" in line:
                    N50 = int(line.split(":")[1])
                if "Software:" in line: 
                    software = line.split(":")[1].strip()
                if "Database" in line:
                    db = line.split(":")[1].strip()
    bakta_v = "Bakta " + software + " DB " + db 
    row_dict = {"length" : length, "contigs" : count, "N50": N50, "bakta_v" : bakta_v}
    return pd.Series(row_dict)

parser = argparse.ArgumentParser(description='')
parser.add_argument('output', help='output directory for the organised bins', type=str)
parser.add_argument('checkm_log', help='checkm output log file (TAB SEPARATED', type=str)
parser.add_argument('bakta_loc', help='directory containing all bakta output for all clusters', type=str)
parser.add_argument('seqkit_log', help='file containing the seqkit output for all clusters', type=str)
parser.add_argument('bin_loc', help='directory containing fasta files for all clusters', type=str)
parser.add_argument('jobid', help='prefix for current jobs', type=str)

args = parser.parse_args()
output = args.output
checkm_log = args.checkm_log
bakta_loc = args.bakta_loc
seqkit_log = args.seqkit_log
bin_loc = args.bin_loc
job_id = args.jobid

comp_software = "CheckM v.1.0.13"
comp_approach = "Marker gene"

colour_dict = dict({'High Quality':'#50C5B7',
                  'Near Complete':'#9CEC5B',
                  'Medium Quality': '#F0F465',
                  'Low Quality': "#F8333C",
                  'Failed': '#646E78'})

# =============================================================================
# CHECKM PARSE
# =============================================================================
checkm_df = pd.read_csv(checkm_log, sep = "\t")
checkm_df['Quality'] = checkm_df.apply(lambda x: qual_cluster(x['Completeness'], x['Contamination']), axis=1)
checkm_df[['Size_bp', 'No_contigs', 'N50_length', '16S_Software']] = checkm_df.apply(lambda x: bsearch(bakta_loc, x["Bin Id"]), axis = 1)

checkm_df = checkm_df.set_index("Bin Id")

high_clusters = checkm_df[checkm_df['Quality'].str.contains("High Quality")].index.values.tolist()
med_qual_clusters = checkm_df[checkm_df['Quality'].str.contains("Medium Quality")].index.values.tolist()
low_qual_clusters = checkm_df[checkm_df['Quality'].str.contains("Low Quality")].index.values.tolist()
NA = checkm_df[checkm_df['Quality'].str.contains("Failed")].index.values.tolist()
all_clusters = high_clusters + med_qual_clusters + low_qual_clusters + NA

checkm_df = checkm_df.drop(checkm_df.columns[[1, 2, 3, 4, 5, 6, 7, 8, 9]], axis=1)

# =============================================================================
# SEQKIT PARSE
# =============================================================================
seqkit_df = pd.read_csv(seqkit_log, sep = "\t")
seqkit_df = seqkit_df[["file", "max_len"]]
seqkit_df["file"] = seqkit_df["file"].str.replace('.fasta','', regex=True)
seqkit_df["file"] = seqkit_df["file"].str.split("/").str[-1]
seqkit_df.set_index("file", inplace=True)
seqkit_df.rename(columns={"max_len":"Max_contig_length"}, inplace = True)
checkm_df = pd.merge(checkm_df, seqkit_df, left_index=True, right_index=True, how="left")

# =============================================================================
# BAKTA PARSE
# =============================================================================

high_qual_clusters= []
near_comp_clusters = []
r16s_comp_clusters = []
trna_num = {}
for cluster in all_clusters:
    loc = str(bakta_loc + cluster + '/' + cluster + '.tsv') #change this too 
    # loc = str(str(cluster) + '.tsv') #don't copy this bit change it !
    for name in glob.glob(loc):
        bakta_file = name
        with open(bakta_file, 'r') as bakta_in:
            trna_set = set()
            rna_set = set()
            rrna_16S = "N"
            for line in bakta_in:
                if "tRNA-Ala" in line:
                    trna_set.add("tRNA-Ala")
                if "tRNA-Arg" in line:
                    trna_set.add("tRNA-Arg")
                if "tRNA-Asn" in line:
                    trna_set.add("tRNA-Asn")
                if "tRNA-Asp" in line:
                    trna_set.add("tRNA-Asp")
                if "tRNA-Cys" in line:
                    trna_set.add("tRNA-Cys")
                if "tRNA-Gln" in line:
                    trna_set.add("tRNA-Gln")
                if "tRNA-Glu" in line:
                    trna_set.add("tRNA-Glu")
                if "tRNA-Gly" in line:
                    trna_set.add("tRNA-Gly")
                if "tRNA-His" in line:
                    trna_set.add("tRNA-His")
                if "tRNA-Ile" in line:
                    trna_set.add("tRNA-Ile")
                if "tRNA-Leu" in line:
                    trna_set.add("tRNA-Leu")
                if "tRNA-Lys" in line:
                    trna_set.add("tRNA-Lys")
                if "tRNA-Met" in line:
                    trna_set.add("tRNA-Met")
                if "tRNA-Phe" in line:
                    trna_set.add("tRNA-Phe")
                if "tRNA-Pro" in line:
                    trna_set.add("tRNA-Pro")
                if "tRNA-Ser" in line:
                    trna_set.add("tRNA-Ser")
                if "tRNA-Thr" in line:
                    trna_set.add("tRNA-Thr")
                if "tRNA-Trp" in line:
                    trna_set.add("tRNA-Trp")
                if "tRNA-Tyr" in line:
                    trna_set.add("tRNA-Tyr")
                if "tRNA-Val" in line:
                    trna_set.add("tRNA-Val")
                if ("5S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('5S')
                if ("16S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('16S')
                    rrna_16S = "Y"
                if ("23S ribosomal RNA" in line) and ("partial" not in line):
                    rna_set.add('23s')
        if rrna_16S == "Y":
            r16s_comp_clusters.append(cluster)
        if cluster in high_clusters:
            if (len(trna_set) >= 18) and (len(rna_set) == 3):
                high_qual_clusters.append(cluster)
            else:
                near_comp_clusters.append(cluster) # adds high qual that fail trna/rna
        trna_num.update({cluster : len(trna_set)})


# =============================================================================
# Add these new qualities to the checkm dataframe        
# =============================================================================

checkm_df.loc[checkm_df.index.isin(near_comp_clusters), "Quality"] = "Near Complete"

checkm_df.loc[checkm_df.index.isin(r16s_comp_clusters), "16S_Recovered"] = "Y"
checkm_df["16S_Recovered"] = checkm_df["16S_Recovered"].fillna("N")

# Sort out legend order
label_order = ["High Quality", "Near Complete", "Medium Quality", "Low Quality", "Failed"]
labels_all = checkm_df["Quality"].unique().tolist()

for item in label_order:
    if item not in labels_all:
        label_order.remove(item)

labels_list = label_order

# =============================================================================
# Basic plot
# =============================================================================

checkm_df["Purity"] = 100 - checkm_df["Contamination"]
plt.figure(figsize=(15, 10))
ax = sns.scatterplot(data = checkm_df, x="Completeness", y="Purity",
                hue='Quality', size = 'Size_bp', sizes=(20, 800), alpha = 0.6, 
                palette=colour_dict, hue_order= labels_list)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

plt.xlabel('Completeness (%)', size = 24)
plt.ylabel('Purity (%)', size = 24)
plt.xlim(0)
plt.legend(prop={'size': 20})
plt.tight_layout()
plt.tick_params(labelsize=18)

plt.savefig(job_id + '_mag_qual.png')
# =============================================================================
# COPYING FILES INTO QUAL DIRECTORIES
# =============================================================================

location = bin_loc
new_loc = output + "/" + job_id + "/"
os.makedirs(new_loc + "high_qual", exist_ok=True)
os.makedirs(new_loc + "near_comp", exist_ok=True)
os.makedirs(new_loc + "med_qual", exist_ok=True)
os.makedirs(new_loc + "low_qual", exist_ok=True)
os.makedirs(new_loc + "failed", exist_ok=True)

for high in high_qual_clusters:
    file = location + high + ".fasta"
    copyfile(file, new_loc +"high_qual/"+high+".fasta")

for nc in near_comp_clusters:
    file = location + nc + ".fasta"
    copyfile(file, new_loc +"near_comp/"+nc+".fasta")

for med in med_qual_clusters:
    file = location + med + ".fasta"
    copyfile(file, new_loc+"med_qual/"+med+".fasta")

for low in low_qual_clusters:
    file = location + low + ".fasta"
    copyfile(file, new_loc+"low_qual/"+low+".fasta")

for NA_bin in NA:
    file = location + NA_bin + ".fasta"
    copyfile(file, new_loc+"failed/"+NA_bin+".fasta")

# =============================================================================
# OUTPUT CREATED HERE
# =============================================================================

magqual_df = checkm_df[["Quality", "Completeness", "Contamination", "16S_Recovered", "16S_Software", "Size_bp", "No_contigs", "N50_length", "Max_contig_length"]].copy()
magqual_df["tRNA_Extracted"] = pd.Series(trna_num)
magqual_df["tRNA_Software"] = magqual_df["16S_Software"]
magqual_df["Completeness_Approach"] = comp_approach
magqual_df["Completeness_Software"] = comp_software

magqual_df = magqual_df.reindex(columns=["Quality", "Completeness", "Contamination", "Completeness_Software","Completeness_Approach", "16S_Recovered", "16S_Software", "tRNA_Extracted", "tRNA_Software", "Size_bp", "No_contigs", "N50_length", "Max_contig_length"])

magqual_df.to_csv("analysis/" + job_id + "_mag_qual_statistics.csv")

print("-" * 12)
print(" NUMBER MAGs")
print("-" * 12)
print("High Quality:", len(high_qual_clusters))
print("Near Complete:", len(near_comp_clusters))
print("Med Quality:", len(med_qual_clusters))
print("Low Quality:", len(low_qual_clusters))
print("Failed:", len(NA), "\n")
print("-" * 12)
print(" MAG IDs")
print("-" * 12)
print("High Quality: " , ", ".join([str(x) for x in high_qual_clusters]))
print("Near Complete: ", ", ".join([str(x) for x in near_comp_clusters]))
print("Med Quality: ", ", ".join([str(x) for x in med_qual_clusters]))
print("Low Quality: ", ", ".join([str(x) for x in low_qual_clusters]))
print("Failed: ", ", ".join([str(x) for x in NA]))

Python Pandas matplotlib seaborn seqkit CheckM Bakta From line 11 of python/qual_parse.py

Snakemake pipelines for replicating sstar analysis

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.use("Agg")

import seaborn as sns
sns.set_style("darkgrid")

from sklearn import metrics


sstar_1src_accuracy = pd.read_csv(snakemake.input.sstar_1src_accuracy, sep="\t").dropna()
sprime_1src_accuracy = pd.read_csv(snakemake.input.sprime_1src_accuracy, sep="\t").dropna()
skovhmm_1src_accuracy = pd.read_csv(snakemake.input.skovhmm_1src_accuracy, sep="\t").dropna()

sstar_1src_accuracy_grouped = sstar_1src_accuracy.groupby(['demography', 'scenario', 'sample', 'cutoff'], as_index=False)
sprime_1src_accuracy_grouped = sprime_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)
skovhmm_1src_accuracy_grouped = skovhmm_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)

sstar_1src_accuracy_mean = sstar_1src_accuracy_grouped.mean()
sprime_1src_accuracy_mean = sprime_1src_accuracy_grouped.mean()
skovhmm_1src_accuracy_mean = skovhmm_1src_accuracy_grouped.mean()

sstar_1src_accuracy_mean.to_csv(snakemake.output.sstar_1src_accuracy_mean, sep="\t", index=False)
sprime_1src_accuracy_mean.to_csv(snakemake.output.sprime_1src_accuracy_mean, sep="\t", index=False)
skovhmm_1src_accuracy_mean.to_csv(snakemake.output.skovhmm_1src_accuracy_mean, sep="\t", index=False)

sprime_1src_accuracy_mean['scenario'] = ['true'] * len(sprime_1src_accuracy_mean)
skovhmm_1src_accuracy_mean['scenario'] = ['true'] * len(skovhmm_1src_accuracy_mean)

sstar_2src_accuracy = pd.read_csv(snakemake.input.sstar_2src_accuracy, sep="\t").dropna()
sprime_2src_accuracy = pd.read_csv(snakemake.input.sprime_2src_accuracy, sep="\t").dropna()
archaicseeker2_2src_accuracy = pd.read_csv(snakemake.input.archaicseeker2_2src_accuracy, sep="\t").dropna()

sstar_2src_accuracy_grouped = sstar_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
sprime_2src_accuracy_grouped = sprime_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
archaicseeker2_2src_accuracy_grouped = archaicseeker2_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)

sstar_2src_accuracy_mean = sstar_2src_accuracy_grouped.mean()
sprime_2src_accuracy_mean = sprime_2src_accuracy_grouped.mean()
archaicseeker2_2src_accuracy_mean = archaicseeker2_2src_accuracy_grouped.mean()

sstar_2src_accuracy_mean.to_csv(snakemake.output.sstar_2src_accuracy_mean, sep="\t", index=False)
sprime_2src_accuracy_mean.to_csv(snakemake.output.sprime_2src_accuracy_mean, sep="\t", index=False)
archaicseeker2_2src_accuracy_mean.to_csv(snakemake.output.archaicseeker2_2src_accuracy_mean, sep="\t", index=False)

methods1 = ['sstar', 'sprime', 'skovhmm']
demography1 = ['HumanNeanderthal', 'BonoboGhost']
samples = ['nref_10_ntgt_1', 'nref_50_ntgt_1']
scenarios = ['true', 'const', 'ref_tgt_only']
accuracy1 = {
    'sstar': sstar_1src_accuracy_mean,
    'sprime': sprime_1src_accuracy_mean,
    'skovhmm': skovhmm_1src_accuracy_mean,
}
methods2 = [
    'sstar', 
    'sprime', 
    'archaicseeker2'
]
demography2 = ['HumanNeanderthalDenisovan', 'ChimpBonoboGhost']
accuracy2 = {
    'sstar': sstar_2src_accuracy_mean,
    'sprime': sprime_2src_accuracy_mean,
    'archaicseeker2': archaicseeker2_2src_accuracy_mean,
}

fig, axs = plt.subplots(nrows=2, ncols=3, constrained_layout=True, figsize=(7.5,4), dpi=350)
gridspec = axs[0, 0].get_subplotspec().get_gridspec()
for a in axs[:,2]:
    a.remove()

markers = {
    'nref_10_ntgt_1': {'symbol':'.', 'size': 6},
    'nref_50_ntgt_1': {'symbol':'*', 'size': 6},
}

colors = {
    'sstar': {'true': 'blue', 'const': 'cyan', 'ref_tgt_only': 'purple'},
    'skovhmm': 'green',
    'sprime': 'orange',
    'archaicseeker2': 'magenta',
}

linestyles = {
    'const': 'dotted',
    'true': 'solid',
    'ref_tgt_only': (0, (3, 1, 1, 1, 1, 1)),
}

titles = {
    'HumanNeanderthal': 'Human-Neanderthal model',
    'BonoboGhost': 'Bonobo-Ghost model',
    'HumanNeanderthalDenisovan': 'Human-Neanderthal-Denisovan model',
    'ChimpBonoboGhost': 'Chimpanzee-Ghost-Bonobo model',
}

zorders = {
    'sstar': 2,
    'skovhmm': 5,
    'sprime': 10,
}

j = 0
for d in demography1:
    for s in samples:
        for sc in scenarios:
            for m in methods1:
                if m == 'sstar': color = colors[m][sc]
                else: color = colors[m]
                df = accuracy1[m][
                        (accuracy1[m]['demography'] == d) &
                        (accuracy1[m]['sample'] == s) &
                        (accuracy1[m]['scenario'] == sc)
                    ].sort_values(by='recall', ascending=False)
                recall = df['recall']
                precision = df['precision']
                if (m == 'sprime') or (m == 'skovhmm'):
                    if sc != 'true': continue

                if d == 'BonoboGhost':
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color, zorder=zorders[m])
                else:
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color)

    axs[0,j].set_xlabel('Recall (%)', fontsize=10)
    axs[0,j].set_ylabel('Precision (%)', fontsize=10)
    axs[0,j].set_xlim([-5, 105])
    axs[0,j].set_ylim([-5, 105])
    axs[0,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[0,j].text(-35, 110, 'B', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2.25,2.25], c='red', alpha=0.5)
    if j == 1: 
        axs[0,j].text(-35, 110, 'C', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2,2], c='red', alpha=0.5)

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[0,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[0,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

j = 0
for d in demography2:
    for s in samples:
        for m in methods2:
            if m == 'sstar': color = colors[m]['true']
            else: color = colors[m]
            src1_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src1')
                     ].sort_values(by='recall', ascending=False)
            src2_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src2')
                     ].sort_values(by='recall', ascending=False)
            src1_recall = src1_df['recall']
            src1_precision = src1_df['precision']
            src2_recall = src2_df['recall']
            src2_precision = src2_df['precision']
            axs[1,j].plot(src1_recall, src1_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, markerfacecolor='white')
            axs[1,j].plot(src2_recall, src2_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, linestyle='dashdot')

    axs[1,j].set_xlabel('Recall (%)', fontsize=10)
    axs[1,j].set_ylabel('Precision (%)', fontsize=10)
    axs[1,j].set_xlim([-5, 105])
    axs[1,j].set_ylim([-5, 105])
    axs[1,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[1,j].text(-35, 110, 'D', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[0.2,0.2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[4,4], c='red', linestyle='dotted')
    if j == 1: 
        axs[1,j].text(-35, 110, 'E', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[2,2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[2,2], c='red', linestyle='dotted')

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[1,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[1,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

# legend
subfig = fig.add_subfigure(gridspec[:,2])
handles, labels = subfig.gca().get_legend_handles_labels()
sstar_line = plt.Line2D([0], [0], label='sstar (full)', color=colors['sstar']['true'])
sstar_line2 = plt.Line2D([0], [0], label='sstar (constant)', color=colors['sstar']['const'])
sstar_line3 = plt.Line2D([0], [0], label='sstar (only ref & tgt)', color=colors['sstar']['ref_tgt_only'])
skovhmm_line = plt.Line2D([0], [0], label='SkovHMM', color=colors['skovhmm'])
sprime_line = plt.Line2D([0], [0], label='SPrime', color=colors['sprime'])
archaicseeker2_line = plt.Line2D([0], [0], label='ArchaicSeeker2.0', color=colors['archaicseeker2'])
baseline1 = plt.Line2D([0], [0], label='baseline/src1 baseline', color='red', alpha=0.5)
baseline2 = plt.Line2D([0], [0], label='src2 baseline', color='red', linestyle='dotted')
f1_curves = plt.Line2D([0], [0], label='iso-F1 curves', color='black', alpha=0.4, linestyle='dotted')
nref_10_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_10_ntgt_1']['symbol'],
                            ms=5, label='Nref = 10', color='black', linewidth=0)
nref_50_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_50_ntgt_1']['symbol'],
                            ms=5, label='Nref = 50', color='black', linewidth=0)
src1 = plt.Line2D([0], [0], label='src1', color='black', marker='o', ms=4, markerfacecolor='white')
src2 = plt.Line2D([0], [0], label='src2', color='black', marker='o', ms=4, linestyle='dotted')

handles.extend([sstar_line, sstar_line2, sstar_line3, skovhmm_line, sprime_line, archaicseeker2_line,
                baseline1, baseline2, f1_curves, nref_10_ntgt_1, nref_50_ntgt_1, src1, src2])
subfig.legend(handles=handles, fontsize=8, handlelength=1.5)

fig.set_constrained_layout_pads(w_pad=4 / 72, h_pad=4 / 72, hspace=0, wspace=0.1)
plt.savefig(snakemake.output.accuracy, bbox_inches='tight')

Python Pandas numpy matplotlib seaborn sklearn BiocSklearn From line 1 of workflows/plots.py

Snakemake workflow: dna-seq-gatk-variant-calling

import common
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

calls = pd.read_table(snakemake.input[0], header=[0, 1])
samples = [name for name in calls.columns.levels[0] if name != "VARIANT"]
sample_info = calls.loc[:, samples].stack([0, 1]).unstack().reset_index(1, drop=False)
sample_info = sample_info.rename({"level_1": "sample"}, axis=1)

sample_info = sample_info[sample_info["DP"] > 0]
sample_info["freq"] = sample_info["AD"] / sample_info["DP"]
sample_info.index = np.arange(sample_info.shape[0])

plt.figure()

sns.stripplot(x="sample", y="freq", data=sample_info, jitter=True)
plt.ylabel("allele frequency")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.freqs)

plt.figure()

sns.stripplot(x="sample", y="DP", data=sample_info, jitter=True)
plt.ylabel("read depth")
plt.xticks(rotation="vertical")

plt.savefig(snakemake.output.depths)

Python Pandas numpy matplotlib seaborn common From line 1 of scripts/plot-depths.py

Workflow Steps and Code Snippets

seaborn