Workflow Steps and Code Snippets

167 tagged steps and code snippets that match the keyword sklearn

Snakemake pipelines for replicating sstar analysis

import matplotlib
matplotlib.use("Agg")  # select the non-interactive backend before pyplot is imported

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style("darkgrid")

from sklearn import metrics


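# Load accuracy tables for the single-source (1src) scenarios from the Snakemake inputs and drop incomplete rows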
sstar_1src_accuracy = pd.read_csv(snakemake.input.sstar_1src_accuracy, sep="\t").dropna()
sprime_1src_accuracy = pd.read_csv(snakemake.input.sprime_1src_accuracy, sep="\t").dropna()
skovhmm_1src_accuracy = pd.read_csv(snakemake.input.skovhmm_1src_accuracy, sep="\t").dropna()

sstar_1src_accuracy_grouped = sstar_1src_accuracy.groupby(['demography', 'scenario', 'sample', 'cutoff'], as_index=False)
sprime_1src_accuracy_grouped = sprime_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)
skovhmm_1src_accuracy_grouped = skovhmm_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)

sstar_1src_accuracy_mean = sstar_1src_accuracy_grouped.mean()
sprime_1src_accuracy_mean = sprime_1src_accuracy_grouped.mean()
skovhmm_1src_accuracy_mean = skovhmm_1src_accuracy_grouped.mean()

sstar_1src_accuracy_mean.to_csv(snakemake.output.sstar_1src_accuracy_mean, sep="\t", index=False)
sprime_1src_accuracy_mean.to_csv(snakemake.output.sprime_1src_accuracy_mean, sep="\t", index=False)
skovhmm_1src_accuracy_mean.to_csv(snakemake.output.skovhmm_1src_accuracy_mean, sep="\t", index=False)

sprime_1src_accuracy_mean['scenario'] = ['true'] * len(sprime_1src_accuracy_mean)
skovhmm_1src_accuracy_mean['scenario'] = ['true'] * len(skovhmm_1src_accuracy_mean)

sstar_2src_accuracy = pd.read_csv(snakemake.input.sstar_2src_accuracy, sep="\t").dropna()
sprime_2src_accuracy = pd.read_csv(snakemake.input.sprime_2src_accuracy, sep="\t").dropna()
archaicseeker2_2src_accuracy = pd.read_csv(snakemake.input.archaicseeker2_2src_accuracy, sep="\t").dropna()

sstar_2src_accuracy_grouped = sstar_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
sprime_2src_accuracy_grouped = sprime_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
archaicseeker2_2src_accuracy_grouped = archaicseeker2_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)

sstar_2src_accuracy_mean = sstar_2src_accuracy_grouped.mean()
sprime_2src_accuracy_mean = sprime_2src_accuracy_grouped.mean()
archaicseeker2_2src_accuracy_mean = archaicseeker2_2src_accuracy_grouped.mean()

sstar_2src_accuracy_mean.to_csv(snakemake.output.sstar_2src_accuracy_mean, sep="\t", index=False)
sprime_2src_accuracy_mean.to_csv(snakemake.output.sprime_2src_accuracy_mean, sep="\t", index=False)
archaicseeker2_2src_accuracy_mean.to_csv(snakemake.output.archaicseeker2_2src_accuracy_mean, sep="\t", index=False)

methods1 = ['sstar', 'sprime', 'skovhmm']
demography1 = ['HumanNeanderthal', 'BonoboGhost']
samples = ['nref_10_ntgt_1', 'nref_50_ntgt_1']
scenarios = ['true', 'const', 'ref_tgt_only']
accuracy1 = {
    'sstar': sstar_1src_accuracy_mean,
    'sprime': sprime_1src_accuracy_mean,
    'skovhmm': skovhmm_1src_accuracy_mean,
}
methods2 = [
    'sstar', 
    'sprime', 
    'archaicseeker2'
]
demography2 = ['HumanNeanderthalDenisovan', 'ChimpBonoboGhost']
accuracy2 = {
    'sstar': sstar_2src_accuracy_mean,
    'sprime': sprime_2src_accuracy_mean,
    'archaicseeker2': archaicseeker2_2src_accuracy_mean,
}

fig, axs = plt.subplots(nrows=2, ncols=3, constrained_layout=True, figsize=(7.5,4), dpi=350)
gridspec = axs[0, 0].get_subplotspec().get_gridspec()
for a in axs[:,2]:
    a.remove()

markers = {
    'nref_10_ntgt_1': {'symbol':'.', 'size': 6},
    'nref_50_ntgt_1': {'symbol':'*', 'size': 6},
}

colors = {
    'sstar': {'true': 'blue', 'const': 'cyan', 'ref_tgt_only': 'purple'},
    'skovhmm': 'green',
    'sprime': 'orange',
    'archaicseeker2': 'magenta',
}

linestyles = {
    'const': 'dotted',
    'true': 'solid',
    'ref_tgt_only': (0, (3, 1, 1, 1, 1, 1)),
}

titles = {
    'HumanNeanderthal': 'Human-Neanderthal model',
    'BonoboGhost': 'Bonobo-Ghost model',
    'HumanNeanderthalDenisovan': 'Human-Neanderthal-Denisovan model',
    'ChimpBonoboGhost': 'Chimpanzee-Ghost-Bonobo model',
}

zorders = {
    'sstar': 2,
    'skovhmm': 5,
    'sprime': 10,
}

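# Top row of panels: precision-recall curves for the single-source demographic models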
j = 0
for d in demography1:
    for s in samples:
        for sc in scenarios:
            for m in methods1:
                # SPrime and SkovHMM only report the 'true' scenario, so skip the others
                if m in ('sprime', 'skovhmm') and sc != 'true': continue
                if m == 'sstar': color = colors[m][sc]
                else: color = colors[m]
                df = accuracy1[m][
                        (accuracy1[m]['demography'] == d) &
                        (accuracy1[m]['sample'] == s) &
                        (accuracy1[m]['scenario'] == sc)
                    ].sort_values(by='recall', ascending=False)
                recall = df['recall']
                precision = df['precision']

                if d == 'BonoboGhost':
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color, zorder=zorders[m])
                else:
                    axs[0,j].plot(recall, precision,
                        marker=markers[s]['symbol'], ms=markers[s]['size'],
                        c=color)

    axs[0,j].set_xlabel('Recall (%)', fontsize=10)
    axs[0,j].set_ylabel('Precision (%)', fontsize=10)
    axs[0,j].set_xlim([-5, 105])
    axs[0,j].set_ylim([-5, 105])
    axs[0,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[0,j].text(-35, 110, 'B', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2.25,2.25], c='red', alpha=0.5)
    if j == 1: 
        axs[0,j].text(-35, 110, 'C', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2,2], c='red', alpha=0.5)

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[0,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[0,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

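# Bottom row of panels: precision-recall curves for the two-source models, with separate curves for src1 and src2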
j = 0
for d in demography2:
    for s in samples:
        for m in methods2:
            if m == 'sstar': color = colors[m]['true']
            else: color = colors[m]
            src1_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src1')
                     ].sort_values(by='recall', ascending=False)
            src2_df = accuracy2[m][
                         (accuracy2[m]['demography'] == d) &
                         (accuracy2[m]['sample'] == s) &
                         (accuracy2[m]['src'] == 'src2')
                     ].sort_values(by='recall', ascending=False)
            src1_recall = src1_df['recall']
            src1_precision = src1_df['precision']
            src2_recall = src2_df['recall']
            src2_precision = src2_df['precision']
            axs[1,j].plot(src1_recall, src1_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, markerfacecolor='white')
            axs[1,j].plot(src2_recall, src2_precision,
                     marker=markers[s]['symbol'], ms=markers[s]['size'],
                     c=color, linestyle='dashdot')

    axs[1,j].set_xlabel('Recall (%)', fontsize=10)
    axs[1,j].set_ylabel('Precision (%)', fontsize=10)
    axs[1,j].set_xlim([-5, 105])
    axs[1,j].set_ylim([-5, 105])
    axs[1,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0: 
        axs[1,j].text(-35, 110, 'D', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[0.2,0.2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[4,4], c='red', linestyle='dotted')
    if j == 1: 
        axs[1,j].text(-35, 110, 'E', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[2,2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[2,2], c='red', linestyle='dotted')

    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[1,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[1,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)

    j += 1

# legend
subfig = fig.add_subfigure(gridspec[:,2])
handles, labels = subfig.gca().get_legend_handles_labels()
sstar_line = plt.Line2D([0], [0], label='sstar (full)', color=colors['sstar']['true'])
sstar_line2 = plt.Line2D([0], [0], label='sstar (constant)', color=colors['sstar']['const'])
sstar_line3 = plt.Line2D([0], [0], label='sstar (only ref & tgt)', color=colors['sstar']['ref_tgt_only'])
skovhmm_line = plt.Line2D([0], [0], label='SkovHMM', color=colors['skovhmm'])
sprime_line = plt.Line2D([0], [0], label='SPrime', color=colors['sprime'])
archaicseeker2_line = plt.Line2D([0], [0], label='ArchaicSeeker2.0', color=colors['archaicseeker2'])
baseline1 = plt.Line2D([0], [0], label='baseline/src1 baseline', color='red', alpha=0.5)
baseline2 = plt.Line2D([0], [0], label='src2 baseline', color='red', linestyle='dotted')
f1_curves = plt.Line2D([0], [0], label='iso-F1 curves', color='black', alpha=0.4, linestyle='dotted')
nref_10_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_10_ntgt_1']['symbol'],
                            ms=5, label='Nref = 10', color='black', linewidth=0)
nref_50_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_50_ntgt_1']['symbol'],
                            ms=5, label='Nref = 50', color='black', linewidth=0)
src1 = plt.Line2D([0], [0], label='src1', color='black', marker='o', ms=4, markerfacecolor='white')
src2 = plt.Line2D([0], [0], label='src2', color='black', marker='o', ms=4, linestyle='dotted')

handles.extend([sstar_line, sstar_line2, sstar_line3, skovhmm_line, sprime_line, archaicseeker2_line,
                baseline1, baseline2, f1_curves, nref_10_ntgt_1, nref_50_ntgt_1, src1, src2])
subfig.legend(handles=handles, fontsize=8, handlelength=1.5)

fig.set_constrained_layout_pads(w_pad=4 / 72, h_pad=4 / 72, hspace=0, wspace=0.1)
plt.savefig(snakemake.output.accuracy, bbox_inches='tight')

General-purpose Snakemake workflow for diffusion-based subcortical parcellation

from sklearn.cluster import SpectralClustering
import numpy as np
import nibabel as nib

# Define a function for saving niftis 
def save_label_nii (labels,mask,affine,out_nifti):
    labels_vol = np.zeros(mask.shape)
    labels_vol[mask > 0] = labels+1 #add a 1 so label 0 is diff from bgnd
    labels_nib = nib.Nifti1Image(labels_vol,affine)
    nib.save(labels_nib,out_nifti)

data = np.load(snakemake.input.connmap_group_npz)
cluster_range = range(2,snakemake.params.max_k+1)
out_nii_list = snakemake.output

conn_group = data['conn_group']
mask = data['mask']
affine = data['affine']

# Concat subjects
conn_group_m = np.moveaxis(conn_group,0,2)
conn_concat = conn_group_m.reshape([conn_group_m.shape[0],conn_group_m.shape[1]*conn_group_m.shape[2]])

# Run spectral clustering and save output nifti
for i,k in enumerate(cluster_range):
    clustering = SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0, affinity='cosine').fit(conn_concat)
    print(f'i={i}, k={k}, saving {out_nii_list[i]}')
    save_label_nii(clustering.labels_, mask, affine, out_nii_list[i])

Snakemake workflow: Zona Incerta Diffusion Parcellation

from sklearn.cluster import SpectralClustering
import numpy as np
import nibabel as nib

# Define a function for saving niftis 
def save_label_nii (labels,mask,affine,out_nifti):
    labels_vol = np.zeros(mask.shape)
    labels_vol[mask > 0] = labels+1 #add a 1 so label 0 is diff from bgnd
    labels_nib = nib.Nifti1Image(labels_vol,affine)
    nib.save(labels_nib,out_nifti)

data = np.load(snakemake.input.connmap_group_npz)
cluster_range = range(2,snakemake.params.max_k+1)
out_nii_list = snakemake.output

conn_group = data['conn_group']
mask = data['mask']
affine = data['affine']

# Concat subjects
conn_group_m = np.moveaxis(conn_group,0,2)
conn_concat = conn_group_m.reshape([conn_group_m.shape[0],conn_group_m.shape[1]*conn_group_m.shape[2]])

# Run spectral clustering and save output nifti
for i,k in enumerate(cluster_range):
    clustering = SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0, affinity='cosine').fit(conn_concat)
    print(f'i={i}, k={k}, saving {out_nii_list[i]}')
    save_label_nii(clustering.labels_, mask, affine, out_nii_list[i])

Workflow for transcript expression analysis.

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA

sleuth_matrix = pd.read_csv(snakemake.input[0], sep='\t')

samples = list(sleuth_matrix)
count_sam = sleuth_matrix.shape[1]

ind = list(range(0, count_sam))

sleuth_matrix = sleuth_matrix.transpose()

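# Project the transposed matrix (samples as rows) onto the first two principal components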
n_components = 2

pca = PCA(n_components=n_components)

sl_pca = pca.fit_transform(sleuth_matrix)

colors = plt.cm.get_cmap("hsv", count_sam+1)

plt.figure(figsize=(8, 8))
for i, sam in zip(ind, samples):
    plt.scatter(sl_pca[i, 0], sl_pca[i, 1], color=colors(i), lw=2, label=sam)
plt.title("PCA of Transcript Expression")
plt.legend(loc="best", shadow=False, scatterpoints=1)


#plt.show()

plt.savefig(snakemake.output[0])

Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences

import numpy as np
from sklearn.model_selection import train_test_split
from Bio import SeqIO
import sys
import os

lnc = SeqIO.to_dict(SeqIO.parse(sys.argv[1], "fasta"))
cds = SeqIO.to_dict(SeqIO.parse(sys.argv[2], "fasta"))
train_lnc = open('./data/input/test_train/train_lnc.fasta', 'w', encoding='utf-8')
print(len(lnc.keys()))
print(len(cds.keys()))
test_lnc = len(lnc.keys())*0.25
test_mrna = test_lnc*2
n=0
keys = list(cds.keys())
pers_lnc=test_lnc/len((list(lnc.keys())))
Y_train, Y_test = train_test_split(list(lnc.keys()), test_size=pers_lnc, shuffle=True)
print(len(Y_test))
if len(Y_train)*2 <= len(cds.keys())/5:
    train_mrna = len(Y_train)*2
else:
    train_mrna = len(cds.keys())/5.2
print(train_mrna)
for w in Y_train:
    print(lnc[w].format('fasta'), end='', file=train_lnc)
train_lnc.close()

while n <= 4:
    os.mkdir('./data/input/test_train/' + str(n))
    train_cds = open('./data/input/test_train/' + str(n) + '/train_mrna.fasta', 'w', encoding='utf-8')
    test_file = open('./data/input/test_train/' + str(n) + '/test.fasta', 'w', encoding='utf-8')
    compare = open('./data/input/test_train/' + str(n) + '/compare.csv', 'w', encoding='utf-8')
    pers_mrna=train_mrna/(len(keys))
    print(pers_mrna)
    X_train, X_test = train_test_split(keys, test_size=pers_mrna, shuffle=True)
    print('train_set',len(X_train))
    print('test_set',len(X_test))
    if len(X_test) > test_mrna:
        pers_mrna_test = test_mrna/(len(X_test))
    else:
        true_per = len(X_test)*0.25
        pers_mrna_test = true_per/len(X_test)
    train, test = train_test_split(X_test, test_size=pers_mrna_test, shuffle=True)
    for w in test:
        print(cds[w].format('fasta'), end='', file=test_file)
        print(cds[w].id , 'mrna', sep='\t', file=compare)
    for w in train:
        print(cds[w].format('fasta'), end='', file=train_cds)
    for w in Y_test:
        print(lnc[w].format('fasta'), end='', file=test_file)
        print(lnc[w].id, 'lnc', sep='\t', file=compare)
    keys = [w for w in keys if w not in X_test]
    print('new_set',len(keys))
    # close the per-fold output files so their buffers are flushed
    train_cds.close()
    test_file.close()
    compare.close()
    n=n+1

Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)

import sys
import pysam
import argparse
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import roc_curve



def phred(vals):
    """ apply the phred scale to the vals provided """
    with np.errstate(divide='raise'):
        try:
            return -10*np.log10(1-vals)
        except FloatingPointError:
            # 1-vals underflowed to 0 somewhere; fall back to a phred scale of 30
            return np.float64(30)

def plot_line(lst, show_discards=False):
    plt.clf()
    roc = np.copy(lst.T)
    roc[1] = phred(roc[1])
    max_val = phred(max(roc[2])/(max(roc[2])+1))
    # discard inf or na cols
    inf_or_na_cols = np.isinf(roc).any(axis=0) | np.isnan(roc).any(axis=0)
    # warn the user if we're discarding the majority of points
    discarded = np.sum(inf_or_na_cols)/roc.shape[1]*100
    if (not show_discards and discarded != 0) or discarded >= 50:
        warnings.warn("Discarding NaN or Inf points ({}% of points)".format(discarded))
    roc = roc[:,~(inf_or_na_cols)]
    # perform a simple linear regression
    p = np.polyfit(roc[0], roc[1], 1)
    r_squared = 1 - (sum((roc[1] - (p[0] * roc[0] + p[1]))**2) / ((len(roc[1]) - 1) * np.var(roc[1], ddof=1)))
    p = np.poly1d(p)
    # plot the points and the line
    plt.scatter(roc[0], roc[1], color='r', label="_nolegend_")
    if max(roc[0]) <= 1:
        plt.xlabel("RF Probability")
    elif max(roc[0]) <= np.pi/2:
        plt.xlabel("Reverse Arcsin of RF Probability")
    else:
        plt.xlabel("Phred-Scaled RF Probability")
    plt.ylabel("Phred-Scaled Precision (QUAL)")
    plt.plot(
        roc[0],
        p(roc[0]),
        label=str(p)+"\nr-squared: "+str(round(r_squared, 2))+ \
            ("\ndiscarded: "+str(int(discarded))+"%" if show_discards else "")
    )
    plt.hlines(max_val, min(roc[0]), max(roc[0]), colors='g', linestyles='dashed')
    plt.legend(frameon=False, loc='lower right')
    plt.tight_layout()

def eqbin_mean(grp, log=True, pseudo=True, discards_ok=False, inverse=False):
    if inverse:
        return np.arcsin(grp.mean())
    else:
        if log:
            if discards_ok or not pseudo:
                return phred(grp).mean()
            else:
                return phred(grp.sum()/(len(grp) + pseudo))
        else:
           return grp.mean()

def tpr_probs(df, bins=15, eqbin=True, log=True, pseudo=True, discards_ok=False, inverse=False):
    """ bin the sites and calculate an accuracy (predicted positive value) for that bin """
    # retrieve only predicted positives
    df = df[df['varca~CLASS:']>=0.5]
    if eqbin:
        bin_size = int(len(df)/bins)
        # create bins (ie each grp) and add a single false positive to it so we don't get Inf
        lst = np.array([
            (
                eqbin_mean(grp['varca~prob.1'], log, pseudo, discards_ok, inverse),
                precision_score(
                    np.append(grp['varca~truth'].values, 0) if pseudo else grp['varca~truth'],
                    np.append(grp['varca~CLASS:'].values, 1) if pseudo else grp['varca~CLASS:']
                ),
                grp['varca~prob.1'].size
            )
            for grp in (df.iloc[i:i+bin_size] for i in range(0,len(df)-bin_size+1,bin_size))
        ])
    else:
        if log:
            df = df.copy()
            df['varca~prob.1'] = phred(df['varca~prob.1'])
        start = phred(0.5) if log else 0.5
        # get the end excluding inf values (in case log == True)
        end = df.loc[df['varca~prob.1'] != np.inf, 'varca~prob.1'].max()
        # create bins (ie each grp) and add a single false positive to it so we don't get Inf
        lst = np.array([
            (
                grp[1]['varca~prob.1'].mean(),
                precision_score(
                    np.append(grp[1]['varca~truth'].values, 0) if pseudo else grp[1]['varca~truth'],
                    np.append(grp[1]['varca~CLASS:'].values, 1) if pseudo else grp[1]['varca~CLASS:']
                ),
                grp[1]['varca~prob.1'].size
            )
            for grp in df.groupby(pd.cut(df['varca~prob.1'], pd.interval_range(start, end, bins)))
        ])
    return lst



def strip_type(caller):
    """
        strip the -indel or -snp from the end of a caller name
    """
    vartype = ''
    if caller.endswith('-snp'):
        caller = caller[:-len('-snp')]
        vartype = 'snp'
    elif caller.endswith('-indel'):
        caller = caller[:-len('-indel')]
        vartype = 'indel'
    # if there is still a dash, get everything after it
    i = caller.rfind('-')
    if i != -1:
        caller = caller[i+1:]
    return caller, vartype

def isnan(val):
    return type(val) is float and np.isnan(val)

def get_calls(prepared, callers=None, pretty=False):
    """
        get the alleles in each row of prepared at the location (CHROM/POS) of loc
        when choosing an alt allele, choose from the callers in the order given
    """
    # keep track of the contigs that we've seen
    contigs = set()
    # whether we've read the header yet
    read_header = False
    # iterate through each row in the df and check whether they match loc
    for chunk in prepared:
        # do some special stuff (parsing the header) on the very first iteration
        if not read_header:
            # if callers is None, retrieve the callers from the columns of the dataframe
            if callers is None:
                callers = [
                    caller[:-len('~ALT')] for caller in chunk.columns
                    if caller.endswith('~ALT') and not caller.startswith('pg-')
                ]
            # what types of variants are we dealing with? let's count how many
            # times they appear in the caller names
            vartypes = {'snp': 0, 'indel': 0}
            # also, let's retrieve the callers as a dictionary
            pretty_callers = {}
            for caller in callers:
                pretty_caller, vartype = strip_type(caller)
                # don't beautify the callers if the user didn't request it
                pretty_callers[caller] = pretty_caller if pretty else caller
                # keep track of how often each vartype appears
                if vartype in vartypes:
                    vartypes[vartype] += 1
            callers = pretty_callers
            # retrieve the first CHROM/POS location and yield whether we are reading indels or snps
            loc, predict = yield max(vartypes, key=vartypes.get)
            read_header = True
        # now iterate through each row (and also convert the POS column to an int)
        for idx, row in chunk.iterrows():
            # check if we already passed the row -- ie we either:
            # 1) moved onto a new contig or
            # 2) moved passed the position
            while (
                idx[0] != loc[0] and loc[0] in contigs
            ) or (
                idx[0] == loc[0] and idx[1] > loc[1]
            ):
                # return None if we couldn't find loc in the df
                loc, predict = yield None
            if idx == loc:
                # we found it!
                found = False
                # now, we must figure out which caller to get the alleles from
                for caller in callers:
                    ref, alt = row[caller+"~REF"], row[caller+"~ALT"]
                    # TODO: make this work for an arbitrary number of variant types for multilabel classification using the other CLASS values in classified
                    # right now, this only works if there's a single binary label
                    if not isnan(ref) and (
                        (isnan(alt) + predict) % 2
                    ):
                        found = True
                        break
                if found:
                    loc, predict = yield callers[caller], ref, alt
                else:
                    # if we know there is a variant here, but none of the other
                    # callers found it, just label it as a non-variant
                    # TODO: figure out the alt allele from inspecting the ref genome?
                    loc, predict = yield 'varca', row['REF'], float('nan')
            # save the current contig so that we know which ones we've seen
            contigs.add(idx[0])

def prob2qual(prob, vartype):
    # values are from linear model that we created from using the "-i" option
    if vartype == 'snp':
        return 0.6237*phred(prob)+8.075
    elif vartype == 'indel':
        return 0.8463*phred(prob)+2.724
    else:
        # we shouldn't ever encounter this situation, but just in case...
        return phred(prob)

def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=False, vartype=None):
    """
        use the results of the prepare pipeline and the classify pipeline
        to create a VCF with all of the classified sites
    """
    # first, get a generator that can read through each call in the prepared df
    prepared = get_calls(
        pd.read_csv(
            prepared, sep="\t", header=0, index_col=["CHROM", "POS"],
            dtype=str, chunksize=cs, na_values=".",
            usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT')
        ), callers, pretty
    )
    # flush out the first item in the generator: the vartype
    if vartype is None:
        vartype = next(prepared)
    else:
        # if the user already gave us the vartype, then just discard this
        next(prepared)
    # also retrieve the classifications as a df
    classified = pd.read_csv(
        classified, sep="\t", header=0, index_col=["CHROM", "POS"],
        dtype={'CHROM':str, 'POS':int}, chunksize=cs, na_values="."
    )
    # keep track of how many sites in the classifications df we've had to skip
    skipped = 0
    # keep track of how many sites we skipped but were predicted to have a variant
    no_alts = 0
    # iterate through each site in the classifications df, get its alleles, and
    # then return them in a nice-looking dictionary
    for chunk in classified:
        for idx, row in chunk.iterrows():
            try:
                # get the alleles for this CHROM/POS location
                call = prepared.send((idx, row['varca~CLASS:']))
            except StopIteration:
                call = None
            # we found a variant but couldn't find an alternate allele!
            no_alts += not (call is None or (isnan(call[2]) + row['varca~CLASS:']) % 2)
            # check: does the site appear in the prepared pipeline?
            # and does this site have a variant?
            if call is None or (not all_sites and isnan(call[2])):
                skipped += 1
                continue
            caller, ref, alt = call
            qual = prob2qual(
                row['varca~prob.'+str(int(not isnan(alt)))], vartype
            )
            # construct a dictionary with all of the relevant details
            yield {
                'contig': str(idx[0]), 'start': idx[1], 'stop': idx[1]+len(ref),
                'qual': qual, 'alleles': (ref, "." if isnan(alt) else alt), 'info': {'CALLER':caller}
            }
    if skipped:
        warnings.warn(
            "Ignored {:n} classification sites that didn't have a variant.".format(skipped)
        )
    if no_alts:
        warnings.warn(
            "Ignored {:n} sites that we predicted to have variants but didn't appear in any of the callers.".format(no_alts)
        )

def write_vcf(out, records):
    """
        write the records to the output vcf
    """
    vcf = pysam.VariantFile(out, mode='w')
    # write the necessary VCF header info
    vcf.header.info.add("CALLER", 1, 'String', "The caller from which this site was taken")
    contigs = set()
    for rec in records:
        # handle pysam increasing the start and end sites by 1
        rec['start'] -= 1
        rec['stop'] -= 1
        # parse the record into a pysam.VariantRecord
        try:
            record = vcf.new_record(
                **rec, samples=None, id=None, filter=None
            )
        except ValueError:
            # add the contig if it hasn't already been added
            if rec['contig'] not in contigs:
                vcf.header.contigs.add(rec['contig'])
                contigs.add(rec['contig'])
            else:
                raise
            # now, try again
            record = vcf.new_record(
                **rec, samples=None, id=None, filter=None
            )
        # write the record to a file
        vcf.write(record)
    return vcf

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--out", default=sys.stdout, help="the filename to save the vcf (or bcf) to"
    )
    parser.add_argument(
        "classified", type=Path, help="a sorted, results.tsv.gz file from the output of the classify pipeline"
    )
    parser.add_argument(
        "prepared", type=Path, nargs="?", default=sys.stdin, help="a sorted, merge.tsv.gz file from the prepare pipeline (if not supplied, this is read from stdin)"
    )
    parser.add_argument(
        '-c', "--callers", default=None, help="a comma separated list of the callers from which to choose alleles, supplied in order of priority (default: all of the callers in the file, in the order they appear)"
    )
    parser.add_argument(
        '-s', "--chunksize", type=np.uint32, default=100000, help="how many rows to read into memory at once (default: 100,000)"
    )
    parser.add_argument(
        '-a', '--all', action='store_true', help="whether to also write non-variant sites to create a gVCF (default: no)"
    )
    parser.add_argument(
        '-p', '--pretty', action='store_true', help="should caller names appear in the vcf by their pretty form (with all dashes intelligently removed) or their original caller ID form? (default: the original form)"
    )
    parser.add_argument(
        '-t', '--type', choices=['indel', 'snp'], default=None, help="whether to recalibrate QUAL values assuming your data are SNPs or indels (default: infer from callers)"
    )
    parser.add_argument(
        '-i', '--internal', action='store_true', help="For testing and internal use: recalibrate the QUAL scores (assumes varca~truth column exists in classified)"
    )
    args = parser.parse_args()

    callers = None
    if args.callers is not None:
        callers = args.callers.split(",")

    if not args.internal:
        import matplotlib
        matplotlib.use('Agg')
        vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type))
    else:
        if not sys.flags.interactive:
            sys.exit("ERROR: You must run this script in python's interactive mode (using python's -i flag) when providing the -i flag to this script.")
        try:
            df = pd.read_csv(args.classified, sep="\t", header=0, index_col=["CHROM", "POS"], usecols=['CHROM', 'POS', 'varca~truth', 'varca~prob.1', 'varca~CLASS:'], low_memory=False).sort_values(by='varca~prob.1')
        except ValueError:
            df = pd.read_csv(args.classified, sep="\t", header=0, index_col=["CHROM", "POS"], usecols=['CHROM', 'POS', 'breakca~truth', 'breakca~prob.1', 'breakca~CLASS:'], low_memory=False).sort_values(by='breakca~prob.1')
            df.columns = ['varca~truth', 'varca~prob.1', 'varca~CLASS:']
        plt.ion()
        plot_line(tpr_probs(df))
import sys
import argparse
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from itertools import product


parser = argparse.ArgumentParser()
parser.add_argument(
    "out", nargs="?", default=sys.stdout, help="the filename to save the data to"
)
known_args, unknown_args = parser.parse_known_args()
# dynamically parse whatever options the user passes us
count = 0
for arg in unknown_args:
    if arg.startswith('--'):
        parser.add_argument('--{}'.format(arg[2:]), type=argparse.FileType('r'))
        count += 1
if count < 1:
    parser.error("Specify the path to at least one space separated table (w/o a header) with two rows (recall/precision) using options like --gatk-indel path/to/gatk-table")
args = parser.parse_args()
all_args = vars(args)


def get_marker():
    """ retrieve a unique marker in a deterministic order """
    yield from product(range(2, 6), range(1, 3), [52])
    # if that wasn't enough, create some more just at a different angle
    yield from product(range(2, 8), range(1, 3), [0])
    # if we ever get to this point, we need more than 20 markers
    yield from product([2]+list(range(3, 6, 2)), range(1, 3), [90])


# initialize all of the colors
callers = ['gatk_indel', 'varscan_indel', 'vardict_indel', 'breakca', 'delly', 'pindel', 'illumina_manta', 'illumina_strelka', 'pg_indel']
colors = dict(zip(callers, plt.cm.Set1(range(len(callers)))))
# add the other callers, as well
for k in list(colors.keys()):
    if k.endswith('_indel'):
        colors[k[:-len('indel')]+"snp"] = colors[k]
    elif k == 'breakca':
        colors['varca'] = colors[k]

markers = get_marker()
# go through each table and get its name from all of the args
for arg in sorted(all_args.keys()):
    if arg not in known_args:
        table = np.loadtxt(all_args[arg])
        # recall: 1st row, precision: 2nd row
        if table.ndim != 1:
            # area = auc(table[0], table[1])
            # colors[arg] = plt.step(
            #     table[0], table[1], where='post',
            #     label=arg+": area={0:0.2f}".format(area)
            # )[0].get_color()
            if arg in colors:
                plt.step(
                    table[0], table[1], where='post', color=colors[arg],
                    label=arg.replace('breakca', 'varca')
                )
            else:
                # pick the color randomly
                plt.step(
                    table[0], table[1], where='post',
                    label=arg.replace('breakca', 'varca')
                )
        else:
            area = table[1]
            # check if this pt has a curve of the same name
            # to make sure they're the same color
            if arg.endswith("_pt") and arg[:-3] in colors:
                extra = {'color': colors[arg[:-3]]}
            else:
                extra = {}
            plt.plot(
                table[0], table[1], marker=next(markers), ms=12,
                label=arg.replace('breakca', 'varca')+": height={0:0.2f}".format(area), **extra
            )

plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize='small')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.gcf().set_size_inches(10, 10)
plt.savefig(args.out, bbox_inches='tight', pad_inches=0.1, dpi=350)

Snakemake workflow for comparison of differential abundance ranks (v0.3.0a2)

import logging

from joblib import dump
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_curve, auc, precision_recall_curve,
                             average_precision_score)
from sklearn.model_selection import RepeatedStratifiedKFold


logger = logging.getLogger("qadabra")
logger.setLevel(logging.INFO)
fh = logging.FileHandler(snakemake.log[0], mode="w")
formatter = logging.Formatter(
    f"[%(asctime)s - {snakemake.rule}] :: %(message)s"
)
fh.setFormatter(formatter)
logger.addHandler(fh)

logging.captureWarnings(True)
logging.getLogger("py.warnings").addHandler(fh)

covariate = snakemake.params["factor_name"]
target = snakemake.params["target"]

logger.info(f"Covariate: {covariate}")
logger.info(f"Target: {target}")

metadata = pd.read_table(snakemake.input["metadata"], sep="\t",
                         index_col=0).dropna(subset=[covariate])
try:
    log_ratios = pd.read_table(snakemake.input["log_ratios"], sep="\t", index_col=0).join(metadata, how="inner")
    X = log_ratios["log_ratio"].values.reshape(-1, 1)
    y = (log_ratios[covariate] == target).astype(int).values

    mean_fpr = np.linspace(0, 1, 100)
    mean_rec = np.linspace(0, 1, 100)

    n_splits = snakemake.config["ml_params"]["n_splits"]
    n_repeats = snakemake.config["ml_params"]["n_repeats"]
    logger.info(
        f"Using repeated stratified k-fold CV with {n_splits} splits & "
        f"{n_repeats} repeats"
    )
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                random_state=1)

    model = LogisticRegression(penalty="none")
    folds = [(train, test) for train, test in cv.split(X, y)]
    model_data = {
        "folds": {
            f"fold_{i+1}": {"train_idx": train, "test_idx": test}
            for i, (train, test) in enumerate(folds)
        }
    }
    model_data["fprs"] = mean_fpr
    model_data["recalls"] = mean_rec

    model_data["log_ratios"] = X
    model_data["truth"] = y

    # https://stackoverflow.com/a/67968945
    tprs = []
    precs = []
    roc_aucs = []
    pr_aucs = []
    for i, (train, test) in enumerate(folds):
        prediction = model.fit(X[train], y[train]).predict_proba(X[test])

        fpr, tpr, _ = roc_curve(y[test], prediction[:, 1])
        tpr_interp = np.interp(mean_fpr, fpr, tpr)
        tprs.append(tpr_interp)
        roc_auc = auc(mean_fpr, tpr_interp)
        roc_aucs.append(roc_auc)

        prec, rec, _ = precision_recall_curve(y[test], prediction[:, 1])
        prec = prec[::-1]
        rec = rec[::-1]
        prec_interp = np.interp(mean_rec, rec, prec)
        precs.append(prec_interp)
        pr_auc = auc(mean_rec, prec_interp)
        pr_aucs.append(pr_auc)

        logger.info(
            f"Fold {i+1}/{len(folds)}: ROC AUC = {roc_auc:.2f}, "
            f"PR AUC = {pr_auc:.2f}"
        )

    model_data["tprs"] = tprs
    model_data["precisions"] = precs
    model_data["roc_aucs"] = roc_aucs
    model_data["pr_aucs"] = pr_aucs

    mean_roc_auc = np.mean(roc_aucs)
    mean_pr_auc = np.mean(pr_aucs)

    logger.info(f"Mean ROC AUC = {mean_roc_auc:.2f}")
    logger.info(f"Mean PR AUC = {mean_pr_auc:.2f}")

    dump(model_data, snakemake.output[0], compress=True)
    logger.info(f"Saving model to {snakemake.output[0]}")
except Exception:
    logger.error(f"The file {snakemake.input['log_ratios']} is empty or does not exist")
    model_data = {}
    dump(model_data, snakemake.output[0], compress=True)
    logger.info(f"Saving empty model to {snakemake.output[0]}")
import logging

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


logger = logging.getLogger("qadabra")
logger.setLevel(logging.INFO)
fh = logging.FileHandler(snakemake.log[0], mode="w")
formatter = logging.Formatter(
    f"[%(asctime)s - {snakemake.rule}] :: %(message)s"
)
fh.setFormatter(formatter)
logger.addHandler(fh)

logger.info("Loading differentials...")
diff_df = pd.read_table(snakemake.input[0], sep="\t", index_col=0)
rank_df = (
    diff_df.rank(ascending=False)
    .rename(columns={f"{x}": f"{x}_rank" for x in diff_df.columns})
    .join(diff_df)
)

tool_list = diff_df.columns.tolist()
tool_rank_list = [x + "_rank" for x in tool_list]

logger.info("Scaling values...")
scaled_diff_values = StandardScaler().fit_transform(diff_df.values)

pc_cols = [f"PC{x+1}" for x in range(len(tool_list))]
logger.info("Running PCA...")
pca = PCA(n_components=len(tool_list)).fit(scaled_diff_values)
feat_df = pd.DataFrame(
    pca.transform(scaled_diff_values),
    index=diff_df.index,
    columns=pc_cols
).join(rank_df)
max_vals = feat_df[pc_cols].max()

tool_df = pd.DataFrame(
    np.dot(pca.components_, np.diag(max_vals)),
    index=diff_df.columns,
    columns=pc_cols
)
tool_df.index.name = "tool"

feat_df.to_csv(snakemake.output["features"], sep="\t", index=True)
tool_df.to_csv(snakemake.output["tools"], sep="\t", index=True)

logger.info(f"Saved feature PCs to {snakemake.output['features']}")
logger.info(f"Saved tool PCs to {snakemake.output['tools']}")

prop_exp = pca.explained_variance_ratio_
prop_exp = pd.Series(prop_exp, index=pc_cols)
prop_exp.name = "proportion_var_explained"
prop_exp.to_csv(snakemake.output["prop_exp"], sep="\t", index=True)
logger.info(f"Saved explained variance to {snakemake.output['prop_exp']}")

Degree project: predicting patient outcome with RNA-seq data

import pandas as pd
import numpy as np
import os
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import (
    as_concordance_index_ipcw_scorer,
    as_cumulative_dynamic_auc_scorer,
    as_integrated_brier_score_scorer,
    cumulative_dynamic_auc
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler


event = snakemake.params[0]
days = snakemake.params[1]


def load_data(dataP, spInfo, features):
    data = pd.read_csv(dataP).set_index('ID')
    sample_info = pd.read_csv(spInfo)
    data.drop(['MEAN','VAR'], axis=1, inplace=True)
    data = data.T.reset_index().rename(columns={'index':'SAMPLE'})
    data.columns.names = [None]
    data = data[['SAMPLE']+features]
    if 'RFi' in event:
        data = pd.merge(sample_info[['SAMPLE', 'RFi_days', 'RFi_event', 'RFS_all_event', 'age', 'treatGroup']], data, on='SAMPLE')
        data = data[data.RFi_days.notna()].reset_index(drop=True)
        data = data[data.apply(lambda x: False if (x['RFi_event'] == 0 and x['RFi_days'] < 600) or (x['RFi_event'] == False and x['RFS_all_event'] == True) else True, axis = 1)].reset_index(drop=True)
        data.drop(['RFS_all_event'], axis=1, inplace=True)
    else:
        data = pd.merge(sample_info[['SAMPLE', days, event, 'age', 'treatGroup']], data, on='SAMPLE')
        data = data[data[event].notna()].reset_index(drop=True)
        data = data[data[days].notna()].reset_index(drop=True)
    x, y = data.iloc[::, 3:], data.iloc[::, 1:3]
    treatment = x.treatGroup.astype('category')
    x.drop('treatGroup', axis=1, inplace=True)
    x = pd.DataFrame(StandardScaler().fit_transform(x), columns=x.columns).dropna(axis=1)
    x['treatGroup'] = treatment
    x = OneHotEncoder().fit_transform(x).replace(np.nan, 0)
    return x, y

output = snakemake.output[0]
method = output.split('_')[-2]

if method == 'uni':
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}_uni_fdr.csv'))
    features = features[features['p.adjust']<0.05]
    features = features['fetures'].to_list()
elif method == 'ens':
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}-best.feature'))
    features = features['Feature'].to_list()
elif method == '1run':
    which = output.split('/')[0]
    dataset = lambda x: 'genes' if 'gs' in x else 'transcripts'
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}_1run_{which}_{dataset(output.split("/")[1])}_best_coefs.csv'), index_col=0).reset_index()
    features = features['index'].to_list()
    features = [x for x in features if x != 'age' and not x.startswith('treat')]
else:
    features = []

x, y = load_data(snakemake.input[0], snakemake.input[1], features)
os_type = {'names':('events', 'time'), 'formats':('?', '<f8')}
y = np.array([(a, b) for a, b in zip(y[event].astype(bool), y[days])], dtype=os_type)

xtrain, xval, ytrain, yval = train_test_split(x, y, test_size=0.2, random_state=954)

lower, upper = np.percentile(pd.DataFrame(ytrain).time, [25, 75])
grids_times = np.arange(lower, upper + 1)

cv = KFold(n_splits=3, shuffle=True, random_state=0)
if method == 'uni':
    alphas = 10. ** np.linspace(-1.8, -1, 5)
else:
    alphas = 10. ** np.linspace(-1.9, -1.5, 5)

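# Tune the Coxnet alpha by cross-validated IPCW concordance index (tau set to the upper end of the training-time IQR)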
grids_cindex = GridSearchCV(
    as_concordance_index_ipcw_scorer(CoxnetSurvivalAnalysis(l1_ratio=0.9, tol=1e-15, max_iter=10000), tau=grids_times[-1]),
    param_grid={"estimator__alphas": [[v] for v in alphas]},
    cv=cv,
    error_score=0.5,
    n_jobs=16).fit(xtrain, ytrain)

cph = CoxnetSurvivalAnalysis(l1_ratio=0.9, alphas=[grids_cindex.best_params_["estimator__alphas"]])
cph.fit(xtrain, ytrain)

best_coefs = pd.DataFrame(cph.coef_, index=xtrain.columns, columns=['coefficient'])
non_zero = np.sum(best_coefs.iloc[:, 0] != 0)
non_zero_coefs = best_coefs.query("coefficient != 0")

va_times = np.arange(np.percentile(pd.DataFrame(yval).time, [5, 95])[0], np.percentile(pd.DataFrame(yval).time, [5, 95])[1], 60)
cph_risk_scores = cph.predict(xval)
auc, mean_auc = cumulative_dynamic_auc(ytrain, yval, cph_risk_scores, va_times)



baseline_feature = [x for x in xtrain.columns if x == 'age' or x.startswith('treat')]
base = CoxnetSurvivalAnalysis(l1_ratio=0.9, tol=1e-20, max_iter=10000, alphas=[0])
base.fit(xtrain[baseline_feature], ytrain)
base_risk_scores = base.predict(xval[baseline_feature])
base_auc, base_mean = cumulative_dynamic_auc(ytrain, yval, base_risk_scores, va_times)


with open(output, 'w') as fo:
    fo.write(f'## training \n')
    fo.write(f'# mean cindex: {grids_cindex.cv_results_["mean_test_score"]} \n')
    fo.write(f'# best alpha: {grids_cindex.best_params_["estimator__alphas"]} \n')
    fo.write(f'## validation \n')
    fo.write(f'# cindex:{cph.score(xval, yval)} \n')
    fo.write(f'# features in: {len(features)}+age+treatment, features out: {non_zero} \n')
    fo.write(f'## baseline \n')
    fo.write(f'## features: {baseline_feature} \n')
    fo.write(f'# cindex: {base.score(xval[baseline_feature], yval)} \n')
    fo.write(f'# base_mean_auc: {base_mean} \n')


pd.DataFrame({'AUC':auc, 'base':base_auc, 'time':va_times, 'mean':[mean_auc]*len(auc)}).to_csv(output, index=False, mode='a')
tool / pypi

sklearn

# ⚠️⚠️⚠️ Summary ⚠️⚠️⚠️

⚠️⚠️⚠️ The **`sklearn` PyPI package is deprecated; use `scikit-learn` instead** ⚠️⚠️⚠️

# How to fix the error for the main use cases

- use `pip install scikit-learn` rather than `pip install sklearn`
- replace `sklearn` with `scikit-learn` in your pip requirements files (`requirements.txt`, `setup.py`, `setup.cfg`, `Pipfile`, etc.)
- if the `sklearn` package is used by one of your dependencies, it would be great if you took some time to track down which package uses `sklearn` instead of `scikit-learn` and report it to their issue tracker
- as a last resort, set the environment variable `SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True` to avoid this error

# Brownout schedule

The following table shows the dates and time windows where an exception will be raised if you attempt to install the `sklearn` package from PyPI.

| Dates | Window(s) |
|---------------------------------------|--------------------------------|
| 2022
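A minimal sanity check after applying the fix above (assuming `scikit-learn` has been installed in place of the deprecated `sklearn` distribution): the import name used in code does not change, only the PyPI distribution name does.

# Assumes `pip install scikit-learn` has replaced the deprecated `sklearn` package;
# the module is still imported under the name `sklearn`.
import sklearn
print(sklearn.__version__)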