Workflow Steps and Code Snippets
167 tagged steps and code snippets match the keyword sklearn
Snakemake pipelines for replicating sstar analysis
```python
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import seaborn as sns
sns.set_style("darkgrid")
from sklearn import metrics

sstar_1src_accuracy = pd.read_csv(snakemake.input.sstar_1src_accuracy, sep="\t").dropna()
sprime_1src_accuracy = pd.read_csv(snakemake.input.sprime_1src_accuracy, sep="\t").dropna()
skovhmm_1src_accuracy = pd.read_csv(snakemake.input.skovhmm_1src_accuracy, sep="\t").dropna()

sstar_1src_accuracy_grouped = sstar_1src_accuracy.groupby(['demography', 'scenario', 'sample', 'cutoff'], as_index=False)
sprime_1src_accuracy_grouped = sprime_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)
skovhmm_1src_accuracy_grouped = skovhmm_1src_accuracy.groupby(['demography', 'sample', 'cutoff'], as_index=False)

sstar_1src_accuracy_mean = sstar_1src_accuracy_grouped.mean()
sprime_1src_accuracy_mean = sprime_1src_accuracy_grouped.mean()
skovhmm_1src_accuracy_mean = skovhmm_1src_accuracy_grouped.mean()

sstar_1src_accuracy_mean.to_csv(snakemake.output.sstar_1src_accuracy_mean, sep="\t", index=False)
sprime_1src_accuracy_mean.to_csv(snakemake.output.sprime_1src_accuracy_mean, sep="\t", index=False)
skovhmm_1src_accuracy_mean.to_csv(snakemake.output.skovhmm_1src_accuracy_mean, sep="\t", index=False)

sprime_1src_accuracy_mean['scenario'] = ['true'] * len(sprime_1src_accuracy_mean)
skovhmm_1src_accuracy_mean['scenario'] = ['true'] * len(skovhmm_1src_accuracy_mean)

sstar_2src_accuracy = pd.read_csv(snakemake.input.sstar_2src_accuracy, sep="\t").dropna()
sprime_2src_accuracy = pd.read_csv(snakemake.input.sprime_2src_accuracy, sep="\t").dropna()
archaicseeker2_2src_accuracy = pd.read_csv(snakemake.input.archaicseeker2_2src_accuracy, sep="\t").dropna()

sstar_2src_accuracy_grouped = sstar_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
sprime_2src_accuracy_grouped = sprime_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)
archaicseeker2_2src_accuracy_grouped = archaicseeker2_2src_accuracy.groupby(['demography', 'sample', 'cutoff', 'src'], as_index=False)

sstar_2src_accuracy_mean = sstar_2src_accuracy_grouped.mean()
sprime_2src_accuracy_mean = sprime_2src_accuracy_grouped.mean()
archaicseeker2_2src_accuracy_mean = archaicseeker2_2src_accuracy_grouped.mean()

sstar_2src_accuracy_mean.to_csv(snakemake.output.sstar_2src_accuracy_mean, sep="\t", index=False)
sprime_2src_accuracy_mean.to_csv(snakemake.output.sprime_2src_accuracy_mean, sep="\t", index=False)
archaicseeker2_2src_accuracy_mean.to_csv(snakemake.output.archaicseeker2_2src_accuracy_mean, sep="\t", index=False)

methods1 = ['sstar', 'sprime', 'skovhmm']
demography1 = ['HumanNeanderthal', 'BonoboGhost']
samples = ['nref_10_ntgt_1', 'nref_50_ntgt_1']
scenarios = ['true', 'const', 'ref_tgt_only']
accuracy1 = {
    'sstar': sstar_1src_accuracy_mean,
    'sprime': sprime_1src_accuracy_mean,
    'skovhmm': skovhmm_1src_accuracy_mean,
}
methods2 = ['sstar', 'sprime', 'archaicseeker2']
demography2 = ['HumanNeanderthalDenisovan', 'ChimpBonoboGhost']
accuracy2 = {
    'sstar': sstar_2src_accuracy_mean,
    'sprime': sprime_2src_accuracy_mean,
    'archaicseeker2': archaicseeker2_2src_accuracy_mean,
}

fig, axs = plt.subplots(nrows=2, ncols=3, constrained_layout=True, figsize=(7.5,4), dpi=350)
gridspec = axs[0, 0].get_subplotspec().get_gridspec()
for a in axs[:,2]:
    a.remove()

markers = {
    'nref_10_ntgt_1': {'symbol':'.', 'size': 6},
    'nref_50_ntgt_1': {'symbol':'*', 'size': 6},
}
colors = {
    'sstar': {'true': 'blue', 'const': 'cyan', 'ref_tgt_only': 'purple'},
    'skovhmm': 'green',
    'sprime': 'orange',
    'archaicseeker2': 'magenta',
}
linestyles = {
    'const': 'dotted',
    'true': 'solid',
    'ref_tgt_only': (0, (3, 1, 1, 1, 1, 1)),
}
titles = {
    'HumanNeanderthal': 'Human-Neanderthal model',
    'BonoboGhost': 'Bonobo-Ghost model',
    'HumanNeanderthalDenisovan': 'Human-Neanderthal-Denisovan model',
    'ChimpBonoboGhost': 'Chimpanzee-Ghost-Bonobo model',
}
zorders = {
    'sstar': 2,
    'skovhmm': 5,
    'sprime': 10,
}

j = 0
for d in demography1:
    for s in samples:
        for sc in scenarios:
            for m in methods1:
                if m == 'sstar':
                    color = colors[m][sc]
                else:
                    color = colors[m]
                df = accuracy1[m][
                    (accuracy1[m]['demography'] == d) &
                    (accuracy1[m]['sample'] == s) &
                    (accuracy1[m]['scenario'] == sc)
                ].sort_values(by='recall', ascending=False)
                recall = df['recall']
                precision = df['precision']
                if (m == 'sprime') or (m == 'skovhmm'):
                    if sc != 'true':
                        continue
                if d == 'BonoboGhost':
                    axs[0,j].plot(recall, precision, marker=markers[s]['symbol'], ms=markers[s]['size'], c=color, zorder=zorders[m])
                else:
                    axs[0,j].plot(recall, precision, marker=markers[s]['symbol'], ms=markers[s]['size'], c=color)
    axs[0,j].set_xlabel('Recall (%)', fontsize=10)
    axs[0,j].set_ylabel('Precision (%)', fontsize=10)
    axs[0,j].set_xlim([-5, 105])
    axs[0,j].set_ylim([-5, 105])
    axs[0,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0:
        axs[0,j].text(-35, 110, 'B', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2.25,2.25], c='red', alpha=0.5)
    if j == 1:
        axs[0,j].text(-35, 110, 'C', fontsize=10, weight='bold')
        axs[0,j].plot([0,100],[2,2], c='red', alpha=0.5)
    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[0,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[0,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)
    j += 1

j = 0
for d in demography2:
    for s in samples:
        for m in methods2:
            if m == 'sstar':
                color = colors[m]['true']
            else:
                color = colors[m]
            src1_df = accuracy2[m][
                (accuracy2[m]['demography'] == d) &
                (accuracy2[m]['sample'] == s) &
                (accuracy2[m]['src'] == 'src1')
            ].sort_values(by='recall', ascending=False)
            src2_df = accuracy2[m][
                (accuracy2[m]['demography'] == d) &
                (accuracy2[m]['sample'] == s) &
                (accuracy2[m]['src'] == 'src2')
            ].sort_values(by='recall', ascending=False)
            src1_recall = src1_df['recall']
            src1_precision = src1_df['precision']
            src2_recall = src2_df['recall']
            src2_precision = src2_df['precision']
            axs[1,j].plot(src1_recall, src1_precision, marker=markers[s]['symbol'], ms=markers[s]['size'], c=color, markerfacecolor='white')
            axs[1,j].plot(src2_recall, src2_precision, marker=markers[s]['symbol'], ms=markers[s]['size'], c=color, linestyle='dashdot')
    axs[1,j].set_xlabel('Recall (%)', fontsize=10)
    axs[1,j].set_ylabel('Precision (%)', fontsize=10)
    axs[1,j].set_xlim([-5, 105])
    axs[1,j].set_ylim([-5, 105])
    axs[1,j].set_title(titles[d], fontsize=8, weight='bold')
    if j == 0:
        axs[1,j].text(-35, 110, 'D', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[0.2,0.2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[4,4], c='red', linestyle='dotted')
    if j == 1:
        axs[1,j].text(-35, 110, 'E', fontsize=10, weight='bold')
        axs[1,j].plot([0,100],[2,2], c='red', alpha=0.5)
        axs[1,j].plot([0,100],[2,2], c='red', linestyle='dotted')
    f_scores = np.linspace(20, 80, num=4)
    lines, labels = [], []
    for f_score in f_scores:
        x = np.linspace(1, 100)
        y = f_score * x / (2 * x - f_score)
        (l,) = axs[1,j].plot(x[y >= 0], y[y >= 0], color="black", alpha=0.4, linestyle='dotted', zorder=1)
        axs[1,j].annotate("F1={0:0.0f}%".format(f_score), xy=(101, y[45] + 2), fontsize=8)
    j += 1

# legend
subfig = fig.add_subfigure(gridspec[:,2])
handles, labels = subfig.gca().get_legend_handles_labels()
sstar_line = plt.Line2D([0], [0], label='sstar (full)', color=colors['sstar']['true'])
sstar_line2 = plt.Line2D([0], [0], label='sstar (constant)', color=colors['sstar']['const'])
sstar_line3 = plt.Line2D([0], [0], label='sstar (only ref & tgt)', color=colors['sstar']['ref_tgt_only'])
skovhmm_line = plt.Line2D([0], [0], label='SkovHMM', color=colors['skovhmm'])
sprime_line = plt.Line2D([0], [0], label='SPrime', color=colors['sprime'])
archaicseeker2_line = plt.Line2D([0], [0], label='ArchaicSeeker2.0', color=colors['archaicseeker2'])
baseline1 = plt.Line2D([0], [0], label='baseline/src1 baseline', color='red', alpha=0.5)
baseline2 = plt.Line2D([0], [0], label='src2 baseline', color='red', linestyle='dotted')
f1_curves = plt.Line2D([0], [0], label='iso-F1 curves', color='black', alpha=0.4, linestyle='dotted')
nref_10_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_10_ntgt_1']['symbol'], ms=5, label='Nref = 10', color='black', linewidth=0)
nref_50_ntgt_1 = plt.Line2D([0], [0], marker=markers['nref_50_ntgt_1']['symbol'], ms=5, label='Nref = 50', color='black', linewidth=0)
src1 = plt.Line2D([0], [0], label='src1', color='black', marker='o', ms=4, markerfacecolor='white')
src2 = plt.Line2D([0], [0], label='src2', color='black', marker='o', ms=4, linestyle='dotted')
handles.extend([sstar_line, sstar_line2, sstar_line3, skovhmm_line, sprime_line, archaicseeker2_line, baseline1, baseline2, f1_curves, nref_10_ntgt_1, nref_50_ntgt_1, src1, src2])
subfig.legend(handles=handles, fontsize=8, handlelength=1.5)

fig.set_constrained_layout_pads(w_pad=4 / 72, h_pad=4 / 72, hspace=0, wspace=0.1)
plt.savefig(snakemake.output.accuracy, bbox_inches='tight')
```
General-purpose Snakemake workflow for diffusion-based subcortical parcellation
```python
import sklearn
import numpy as np
import nibabel as nib

# Define a function for saving niftis
def save_label_nii(labels, mask, affine, out_nifti):
    labels_vol = np.zeros(mask.shape)
    labels_vol[mask > 0] = labels + 1  # add a 1 so label 0 is diff from bgnd
    labels_nib = nib.Nifti1Image(labels_vol, affine)
    nib.save(labels_nib, out_nifti)

data = np.load(snakemake.input.connmap_group_npz)

cluster_range = range(2, snakemake.params.max_k + 1)
out_nii_list = snakemake.output

conn_group = data['conn_group']
mask = data['mask']
affine = data['affine']

# Concat subjects
conn_group_m = np.moveaxis(conn_group, 0, 2)
conn_concat = conn_group_m.reshape([conn_group_m.shape[0], conn_group_m.shape[1] * conn_group_m.shape[2]])

# Run spectral clustering and save output nifti
for i, k in enumerate(cluster_range):
    from sklearn.cluster import SpectralClustering
    clustering = SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0, affinity='cosine').fit(conn_concat)
    print(f'i={i}, k={k}, saving {out_nii_list[i]}')
    save_label_nii(clustering.labels_, mask, affine, out_nii_list[i])
```
Snakemake workflow: Zona Incerta Diffusion Parcellation
```python
import sklearn
import numpy as np
import nibabel as nib

# Define a function for saving niftis
def save_label_nii(labels, mask, affine, out_nifti):
    labels_vol = np.zeros(mask.shape)
    labels_vol[mask > 0] = labels + 1  # add a 1 so label 0 is diff from bgnd
    labels_nib = nib.Nifti1Image(labels_vol, affine)
    nib.save(labels_nib, out_nifti)

data = np.load(snakemake.input.connmap_group_npz)

cluster_range = range(2, snakemake.params.max_k + 1)
out_nii_list = snakemake.output

conn_group = data['conn_group']
mask = data['mask']
affine = data['affine']

# Concat subjects
conn_group_m = np.moveaxis(conn_group, 0, 2)
conn_concat = conn_group_m.reshape([conn_group_m.shape[0], conn_group_m.shape[1] * conn_group_m.shape[2]])

# Run spectral clustering and save output nifti
for i, k in enumerate(cluster_range):
    from sklearn.cluster import SpectralClustering
    clustering = SpectralClustering(n_clusters=k, assign_labels="discretize", random_state=0, affinity='cosine').fit(conn_concat)
    print(f'i={i}, k={k}, saving {out_nii_list[i]}')
    save_label_nii(clustering.labels_, mask, affine, out_nii_list[i])
```
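Both parcellation snippets above are driven by Snakemake's `script:` directive, which is what injects the `snakemake` object (`snakemake.input`, `snakemake.params`, `snakemake.output`) they read from. A minimal sketch of how such a rule could be wired up is shown below; the paths, the `max_k` value, and the script name are illustrative placeholders, not taken from either workflow.

```
rule spectral_clustering:
    input:
        # hypothetical path to the group connectivity maps produced upstream
        connmap_group_npz="results/connmap/connmap_group.npz"
    params:
        max_k=4
    output:
        # one NIfTI per k in range(2, max_k + 1), matching the loop in the script
        expand("results/clusters/spectral_k{k}.nii.gz", k=range(2, 5))
    script:
        "scripts/spectral_clustering.py"
```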
Workflow for transcript expression analysis.
```python
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA

sleuth_matrix = pd.read_csv(snakemake.input[0], sep='\t')

samples = list(sleuth_matrix)
count_sam = sleuth_matrix.shape[1]
ind = list(range(0, count_sam))
sleuth_matrix = sleuth_matrix.transpose()

n_components = 2
pca = PCA(n_components=n_components)
sl_pca = pca.fit_transform(sleuth_matrix)

colors = plt.cm.get_cmap("hsv", count_sam + 1)
plt.figure(figsize=(8, 8))
for i, sam in zip(ind, samples):
    plt.scatter(sl_pca[i, 0], sl_pca[i, 1], color=colors(i), lw=2, label=sam)

plt.title("PCA of Transcript expression")
plt.legend(loc="best", shadow=False, scatterpoints=1)
# plt.show()
plt.savefig(snakemake.output[0])
```
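The snippet above plots the first two principal components of the expression matrix but does not report how much variance they explain. If that is wanted, the fitted estimator exposes it via `explained_variance_ratio_`; the following self-contained sketch (random data standing in for a real expression matrix, output file name made up) shows the idea.

```python
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Random stand-in for a samples x transcripts expression matrix.
rng = np.random.default_rng(0)
X = rng.normal(size=(12, 200))

pca = PCA(n_components=2)
scores = pca.fit_transform(X)

# Annotate each axis with the proportion of variance its component explains.
var_pc1, var_pc2 = pca.explained_variance_ratio_
plt.scatter(scores[:, 0], scores[:, 1])
plt.xlabel(f"PC1 ({var_pc1:.1%} of variance)")
plt.ylabel(f"PC2 ({var_pc2:.1%} of variance)")
plt.savefig("pca_example.png")
```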
Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences
```python
import numpy as np
from sklearn.model_selection import train_test_split
from Bio import SeqIO
import sys
import os

lnc = SeqIO.to_dict(SeqIO.parse(sys.argv[1], "fasta"))
cds = SeqIO.to_dict(SeqIO.parse(sys.argv[2], "fasta"))
train_lnc = open('./data/input/test_train/train_lnc.fasta', 'w', encoding='utf-8')
print(len(lnc.keys()))
print(len(cds.keys()))

test_lnc = len(lnc.keys()) * 0.25
test_mrna = test_lnc * 2
n = 0
keys = list(cds.keys())
pers_lnc = test_lnc / len((list(lnc.keys())))

Y_train, Y_test = train_test_split(list(lnc.keys()), test_size=pers_lnc, shuffle=True)
print(len(Y_test))

if len(Y_train) * 2 <= len(cds.keys()) / 5:
    train_mrna = len(Y_train) * 2
else:
    train_mrna = len(cds.keys()) / 5.2
print(train_mrna)

for w in Y_train:
    print(lnc[w].format('fasta'), end='', file=train_lnc)

while n <= 4:
    os.mkdir('./data/input/test_train/' + str(n))
    train_cds = open('./data/input/test_train/' + str(n) + '/train_mrna.fasta', 'w', encoding='utf-8')
    test_file = open('./data/input/test_train/' + str(n) + '/test.fasta', 'w', encoding='utf-8')
    compare = open('./data/input/test_train/' + str(n) + '/compare.csv', 'w', encoding='utf-8')
    pers_mrna = train_mrna / (len(keys))
    print(pers_mrna)
    X_train, X_test = train_test_split(keys, test_size=pers_mrna, shuffle=True)
    print('train_set', len(X_train))
    print('test_set', len(X_test))
    if len(X_test) > test_mrna:
        pers_mrna_test = test_mrna / (len(X_test))
    else:
        true_per = len(X_test) * 0.25
        pers_mrna_test = true_per / len(X_test)
        pers_mrna_test = test_mrna / (len(X_test))
    train, test = train_test_split(X_test, test_size=pers_mrna_test, shuffle=True)
    for w in test:
        print(cds[w].format('fasta'), end='', file=test_file)
        print(cds[w].id, 'mrna', sep='\t', file=compare)
    for w in train:
        print(cds[w].format('fasta'), end='', file=train_cds)
    for w in Y_test:
        print(lnc[w].format('fasta'), end='', file=test_file)
        print(lnc[w].id, 'lnc', sep='\t', file=compare)
    keys = [w for w in keys if w not in X_test]
    print('new_set', len(keys))
    n = n + 1
```
Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)
```python
import sys
import pysam
import argparse
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import roc_curve


def phred(vals):
    """ apply the phred scale to the vals provided """
    with np.errstate(divide='raise'):
        try:
            return -10*np.log10(1-vals)
        except FloatingPointError:
            return np.float64(30)
    return -10*np.ma.log10(1-vals).filled(-3)  # fill all infinite values with a phred scale of 30


def plot_line(lst, show_discards=False):
    plt.clf()
    roc = np.copy(lst.T)
    roc[1] = phred(roc[1])
    max_val = phred(max(roc[2])/(max(roc[2])+1))
    # discard inf or na cols
    inf_or_na_cols = np.isinf(roc).any(axis=0) | np.isnan(roc).any(axis=0)
    # warn the user if we're discarding the majority of points
    discarded = np.sum(inf_or_na_cols)/roc.shape[1]*100
    if (not show_discards and discarded != 0) or discarded >= 50:
        warnings.warn("Discarding NaN or Inf points ({}% of points)".format(discarded))
    roc = roc[:,~(inf_or_na_cols)]
    # perform a simple linear regression
    p = np.polyfit(roc[0], roc[1], 1)
    r_squared = 1 - (sum((roc[1] - (p[0] * roc[0] + p[1]))**2) / ((len(roc[1]) - 1) * np.var(roc[1], ddof=1)))
    p = np.poly1d(p)
    # plot the points and the line
    plt.scatter(roc[0], roc[1], color='r', label="_nolegend_")
    if max(roc[0]) <= 1:
        plt.xlabel("RF Probability")
    elif max(roc[0]) <= np.pi/2:
        plt.xlabel("Reverse Arcsin of RF Probability")
    else:
        plt.xlabel("Phred-Scaled RF Probability")
    plt.ylabel("Phred-Scaled Precision (QUAL)")
    plt.plot(
        roc[0], p(roc[0]),
        label=str(p)+"\nr-squared: "+str(round(r_squared, 2))+ \
        ("\ndiscarded: "+str(int(discarded))+"%" if show_discards else "")
    )
    plt.hlines(max_val, min(roc[0]), max(roc[0]), colors='g', linestyles='dashed')
    plt.legend(frameon=False, loc='lower right')
    plt.tight_layout()


def eqbin_mean(grp, log=True, pseudo=True, discards_ok=False, inverse=False):
    if inverse:
        return np.arcsin(grp.mean())
    else:
        if log:
            if discards_ok or not pseudo:
                return phred(grp).mean()
            else:
                return phred(grp.sum()/(len(grp) + pseudo))
        else:
            return grp.mean()


def tpr_probs(df, bins=15, eqbin=True, log=True, pseudo=True, discards_ok=False, inverse=False):
    """ bin the sites and calculate an accuracy (predicted positive value) for that bin """
    # retrieve only predicted positives
    df = df[df['varca~CLASS:']>=0.5]
    if eqbin:
        bin_size = int(len(df)/bins)
        # create bins (ie each grp) and add a single false positive to it so we don't get Inf
        lst = np.array([
            (
                eqbin_mean(grp['varca~prob.1'], log, pseudo, discards_ok, inverse),
                precision_score(
                    np.append(grp['varca~truth'].values, 0) if pseudo else grp['varca~truth'],
                    np.append(grp['varca~CLASS:'].values, 1) if pseudo else grp['varca~CLASS:']
                ),
                grp['varca~prob.1'].size
            )
            for grp in (df.iloc[i:i+bin_size] for i in range(0,len(df)-bin_size+1,bin_size))
        ])
    else:
        if log:
            df = df.copy()
            df['varca~prob.1'] = phred(df['varca~prob.1'])
        start = phred(0.5) if log else 0.5
        # get the end excluding inf values (in case log == True)
        end = df.loc[df['varca~prob.1'] != np.inf, 'varca~prob.1'].max()
        # create bins (ie each grp) and add a single false positive to it so we don't get Inf
        lst = np.array([
            (
                grp[1]['varca~prob.1'].mean(),
                precision_score(
                    np.append(grp[1]['varca~truth'].values, 0) if pseudo else grp['varca~truth'],
                    np.append(grp[1]['varca~CLASS:'].values, 1) if pseudo else grp['varca~CLASS:']
                ),
                grp[1]['varca~prob.1'].size
            )
            for grp in df.groupby(pd.cut(df['varca~prob.1'], pd.interval_range(start, end, bins)))
        ])
    return lst


def strip_type(caller):
    """ strip the -indel or -snp from the end of a caller name """
    vartype = ''
    if caller.endswith('-snp'):
        caller = caller[:-len('-snp')]
        vartype = 'snp'
    elif caller.endswith('-indel'):
        caller = caller[:-len('-indel')]
        vartype = 'indel'
    # if there is still a dash, get everything after it
    i = caller.rfind('-')
    if i != -1:
        caller = caller[i+1:]
    return caller, vartype


def isnan(val):
    return type(val) is float and np.isnan(val)


def get_calls(prepared, callers=None, pretty=False):
    """
    get the alleles in each row of prepared at the location (CHROM/POS) of loc
    when choosing an alt allele, choose from the callers in the order given
    """
    # keep track of the contigs that we've seen
    contigs = set()
    # whether we've read the header yet
    read_header = False
    # iterate through each row in the df and check whether they match loc
    for chunk in prepared:
        # do some special stuff (parsing the header) on the very first iteration
        if not read_header:
            # if callers is None, retrieve the callers from the columns of the dataframe
            if callers is None:
                callers = [
                    caller[:-len('~ALT')] for caller in chunk.columns
                    if caller.endswith('~ALT') and not caller.startswith('pg-')
                ]
            # what types of variants are we dealing with? let's count how many
            # times they appear in the caller names
            vartypes = {'snp': 0, 'indel': 0}
            # also, let's retrieve the callers as a dictionary
            pretty_callers = {}
            for caller in callers:
                pretty_caller, vartype = strip_type(caller)
                # don't beautify the callers if the user didn't request it
                pretty_callers[caller] = pretty_caller if pretty else caller
                # keep track of how often each vartype appears
                if vartype in vartypes:
                    vartypes[vartype] += 1
            callers = pretty_callers
            # retrieve the first CHROM/POS location and yield whether we are reading indels or snps
            loc, predict = yield max(vartypes, key=vartypes.get)
            read_header = True
        # now iterate through each row (and also convert the POS column to an int)
        for idx, row in chunk.iterrows():
            # check if we already passed the row -- ie we either:
            # 1) moved onto a new contig or
            # 2) moved passed the position
            while (
                idx[0] != loc[0] and loc[0] in contigs
            ) or (
                idx[0] == loc[0] and idx[1] > loc[1]
            ):
                # return None if we couldn't find loc in the df
                loc, predict = yield None
            if idx == loc:
                # we found it!
                found = False
                # now, we must figure out which caller to get the alleles from
                for caller in callers:
                    ref, alt = row[caller+"~REF"], row[caller+"~ALT"]
                    # TODO: make this work for an arbitrary number of variant types for multilabel classification using the other CLASS values in classified
                    # right now, this only works if there's a single binary label
                    if not isnan(ref) and ( (isnan(alt) + predict) % 2 ):
                        found = True
                        break
                if found:
                    loc, predict = yield callers[caller], ref, alt
                else:
                    # if we know there is a variant here, but none of the other
                    # callers found it, just label it as a non-variant
                    # TODO: figure out the alt allele from inspecting the ref genome?
                    loc, predict = yield 'varca', row['REF'], float('nan')
            # save the current contig so that we know which ones we've seen
            contigs.add(idx[0])


def prob2qual(prob, vartype):
    # values are from linear model that we created from using the "-i" option
    if vartype == 'snp':
        return 0.6237*phred(prob)+8.075
    elif vartype == 'indel':
        return 0.8463*phred(prob)+2.724
    else:
        # we shouldn't ever encounter this situation, but just in case...
        return phred(prob)


def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=False, vartype=None):
    """
    use the results of the prepare pipeline and the classify pipeline
    to create a VCF with all of the classified sites
    """
    # first, get a generator that can read through each call in the prepared df
    prepared = get_calls(
        pd.read_csv(
            prepared, sep="\t", header=0, index_col=["CHROM", "POS"],
            dtype=str, chunksize=cs, na_values=".",
            usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT')
        ), callers, pretty
    )
    # flush out the first item in the generator: the vartype
    if vartype is None:
        vartype = next(prepared)
    else:
        # if the user already gave us the vartype, then just discard this
        next(prepared)
    # also retrieve the classifications as a df
    classified = pd.read_csv(
        classified, sep="\t", header=0, index_col=["CHROM", "POS"],
        dtype={'CHROM':str, 'POS':int}, chunksize=cs, na_values="."
    )
    # keep track of how many sites in the classifications df we've had to skip
    skipped = 0
    # keep track of how many sites we skipped but were predicted to have a variant
    no_alts = 0
    # iterate through each site in the classifications df, get its alleles, and
    # then return them in a nice-looking dictionary
    for chunk in classified:
        for idx, row in chunk.iterrows():
            try:
                # get the alleles for this CHROM/POS location
                call = prepared.send((idx, row['varca~CLASS:']))
            except StopIteration:
                call = None
            # we found a variant but couldn't find an alternate allele!
            no_alts += not (call is None or (isnan(call[2]) + row['varca~CLASS:']) % 2)
            # check: does the site appear in the prepared pipeline?
            # and does this site have a variant?
            if call is None or (not all_sites and isnan(call[2])):
                skipped += 1
                continue
            caller, ref, alt = call
            qual = prob2qual(
                row['varca~prob.'+str(int(not isnan(alt)))], vartype
            )
            # construct a dictionary with all of the relevant details
            yield {
                'contig': str(idx[0]), 'start': idx[1], 'stop': idx[1]+len(ref),
                'qual': qual, 'alleles': (ref, "." if isnan(alt) else alt),
                'info': {'CALLER':caller}
            }
    if skipped:
        warnings.warn(
            "Ignored {:n} classification sites that didn't have a variant.".format(skipped)
        )
    if no_alts:
        warnings.warn(
            "Ignored {:n} sites that we predicted to have variants but didn't appear in any of the callers.".format(no_alts)
        )


def write_vcf(out, records):
    """ write the records to the output vcf """
    vcf = pysam.VariantFile(args.out, mode='w')
    # write the necessary VCF header info
    vcf.header.info.add("CALLER", 1, 'String', "The caller from which this site was taken")
    contigs = set()
    for rec in records:
        # handle pysam increasing the start and end sites by 1
        rec['start'] -= 1
        rec['stop'] -= 1
        # parse the record into a pysam.VariantRecord
        try:
            record = vcf.new_record(
                **rec, samples=None, id=None, filter=None
            )
        except ValueError:
            # add the contig if it hasn't already been added
            if rec['contig'] not in contigs:
                vcf.header.contigs.add(rec['contig'])
                contigs.add(rec['contig'])
            else:
                raise
            # now, try again
            record = vcf.new_record(
                **rec, samples=None, id=None, filter=None
            )
        # write the record to a file
        vcf.write(record)
    return vcf


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-o", "--out", default=sys.stdout,
        help="the filename to save the vcf (or bcf) to"
    )
    parser.add_argument(
        "classified", type=Path,
        help="a sorted, results.tsv.gz file from the output of the classify pipeline"
    )
    parser.add_argument(
        "prepared", type=Path, nargs="?", default=sys.stdin,
        help="a sorted, merge.tsv.gz file from the prepare pipeline (if not supplied, this is read from stdin)"
    )
    parser.add_argument(
        '-c', "--callers", default=None,
        help="a comma separated list of the callers from which to choose alleles, supplied in order of priority (default: all of the callers in the file, in the order they appear)"
    )
    parser.add_argument(
        '-s', "--chunksize", type=np.uint32, default=100000,
        help="how many rows to read into memory at once (default: 100,000)"
    )
    parser.add_argument(
        '-a', '--all', action='store_true',
        help="whether to also write non-variant sites to create a gVCF (default: no)"
    )
    parser.add_argument(
        '-p', '--pretty', action='store_true',
        help="should caller names appear in the vcf by their pretty form (with all dashes intelligently removed) or their original caller ID form? (default: the original form)"
    )
    parser.add_argument(
        '-t', '--type', choices=['indel', 'snp'], default=None,
        help="whether to recalibrate QUAL values assuming your data are SNPs or indels (default: infer from callers)"
    )
    parser.add_argument(
        '-i', '--internal', action='store_true',
        help="For testing and internal use: recalibrate the QUAL scores (assumes varca~truth column exists in classified)"
    )
    args = parser.parse_args()

    callers = None
    if args.callers is not None:
        callers = args.callers.split(",")

    if not args.internal:
        import matplotlib
        matplotlib.use('Agg')
        vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type))
    else:
        if not sys.flags.interactive:
            sys.exit("ERROR: You must run this script in python's interactive mode (using python's -i flag) when providing the -i flag to this script.")
        try:
            df = pd.read_csv(
                args.classified, sep="\t", header=0, index_col=["CHROM", "POS"],
                usecols=['CHROM', 'POS', 'varca~truth', 'varca~prob.1', 'varca~CLASS:'],
                low_memory=False
            ).sort_values(by='varca~prob.1')
        except ValueError:
            df = pd.read_csv(
                args.classified, sep="\t", header=0, index_col=["CHROM", "POS"],
                usecols=['CHROM', 'POS', 'breakca~truth', 'breakca~prob.1', 'breakca~CLASS:'],
                low_memory=False
            ).sort_values(by='breakca~prob.1')
            df.columns = ['varca~truth', 'varca~prob.1', 'varca~CLASS:']
        plt.ion()
        plot_line(tpr_probs(df))
```
```python
import sys
import argparse
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from itertools import product

parser = argparse.ArgumentParser()
parser.add_argument(
    "out", nargs="?", default=sys.stdout,
    help="the filename to save the data to"
)
known_args, unknown_args = parser.parse_known_args()

# dynamically parse whatever options the user passes us
count = 0
for arg in unknown_args:
    if arg.startswith('--'):
        parser.add_argument('--{}'.format(arg[2:]), type=argparse.FileType('r'))
        count += 1
if count < 1:
    parser.error("Specify the path to at least one space separated table (w/o a header) with two rows (recall/precision) using options like --gatk-indel path/to/gatk-table")
args = parser.parse_args()
all_args = vars(args)


def get_marker():
    """ retrieve a unique marker in a deterministic order """
    yield from product(range(2, 6), range(1, 3), [52])
    # if that wasn't enough, create some more just at a different angle
    yield from product(range(2, 8), range(1, 3), [0])
    # if we ever get to this point, we need more than 20 markers
    yield from product([2]+list(range(3, 6, 2)), range(1, 3), [90])


# initialize all of the colors
callers = ['gatk_indel', 'varscan_indel', 'vardict_indel', 'breakca', 'delly', 'pindel', 'illumina_manta', 'illumina_strelka', 'pg_indel']
colors = dict(zip(callers, plt.cm.Set1(range(len(callers)))))
# add the other callers, as well
for k in list(colors.keys()):
    if k.endswith('_indel'):
        colors[k[:-len('indel')]+"snp"] = colors[k]
    elif k == 'breakca':
        colors['varca'] = colors[k]
markers = get_marker()

# go through each table and get its name from all of the args
for arg in sorted(all_args.keys()):
    if arg not in known_args:
        table = np.loadtxt(all_args[arg])
        # recall: 1st row, precision: 2nd row
        if table.ndim != 1:
            # area = auc(table[0], table[1])
            # colors[arg] = plt.step(
            #     table[0], table[1], where='post',
            #     label=arg+": area={0:0.2f}".format(area)
            # )[0].get_color()
            if arg in colors:
                plt.step(
                    table[0], table[1], where='post', color=colors[arg],
                    label=arg.replace('breakca', 'varca')
                )
            else:
                # pick the color randomly
                plt.step(
                    table[0], table[1], where='post',
                    label=arg.replace('breakca', 'varca')
                )
        else:
            area = table[1]
            # check if this pt has a curve of the same name
            # to make sure they're the same color
            if arg.endswith("_pt") and arg[:-3] in colors:
                extra = {'color': colors[arg[:-3]]}
            else:
                extra = {}
            plt.plot(
                table[0], table[1], marker=next(markers), ms=12,
                label=arg.replace('breakca', 'varca')+": height={0:0.2f}".format(area),
                **extra
            )

plt.legend(bbox_to_anchor=(1.02, 1), loc="upper left", fontsize='small')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.0])
plt.xlim([0.0, 1.0])
plt.gcf().set_size_inches(10, 10)
plt.savefig(args.out, bbox_inches='tight', pad_inches=0.1, dpi=350)
```
Snakemake workflow for comparison of differential abundance ranks (v0.3.0a2)
```python
import logging

from joblib import dump
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_curve, auc, precision_recall_curve,
                             average_precision_score)
from sklearn.model_selection import RepeatedStratifiedKFold

logger = logging.getLogger("qadabra")
logger.setLevel(logging.INFO)
fh = logging.FileHandler(snakemake.log[0], mode="w")
formatter = logging.Formatter(
    f"[%(asctime)s - {snakemake.rule}] :: %(message)s"
)
fh.setFormatter(formatter)
logger.addHandler(fh)

logging.captureWarnings(True)
logging.getLogger("py.warnings").addHandler(fh)

covariate = snakemake.params["factor_name"]
target = snakemake.params["target"]
logger.info(f"Covariate: {covariate}")
logger.info(f"Target: {target}")

metadata = pd.read_table(snakemake.input["metadata"], sep="\t", index_col=0).dropna(subset=[covariate])

try:
    log_ratios = pd.read_table(snakemake.input["log_ratios"], sep="\t", index_col=0).join(metadata, how="inner")

    X = log_ratios["log_ratio"].values.reshape(-1, 1)
    y = (log_ratios[covariate] == target).astype(int).values

    mean_fpr = np.linspace(0, 1, 100)
    mean_rec = np.linspace(0, 1, 100)

    n_splits = snakemake.config["ml_params"]["n_splits"]
    n_repeats = snakemake.config["ml_params"]["n_repeats"]
    logger.info(
        f"Using repeated stratified k-fold CV with {n_splits} splits & "
        f"{n_repeats} repeats"
    )
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=1)
    model = LogisticRegression(penalty="none")
    folds = [(train, test) for train, test in cv.split(X, y)]

    model_data = {
        "folds": {
            f"fold_{i+1}": {"train_idx": train, "test_idx": test}
            for i, (train, test) in enumerate(folds)
        }
    }
    model_data["fprs"] = mean_fpr
    model_data["recalls"] = mean_rec
    model_data["log_ratios"] = X
    model_data["truth"] = y

    # https://stackoverflow.com/a/67968945
    tprs = []
    precs = []
    roc_aucs = []
    pr_aucs = []
    for i, (train, test) in enumerate(folds):
        prediction = model.fit(X[train], y[train]).predict_proba(X[test])
        fpr, tpr, _ = roc_curve(y[test], prediction[:, 1])
        tpr_interp = np.interp(mean_fpr, fpr, tpr)
        tprs.append(tpr_interp)
        roc_auc = auc(mean_fpr, tpr_interp)
        roc_aucs.append(roc_auc)

        prec, rec, _ = precision_recall_curve(y[test], prediction[:, 1])
        prec = prec[::-1]
        rec = rec[::-1]
        prec_interp = np.interp(mean_rec, rec, prec)
        precs.append(prec_interp)
        pr_auc = auc(mean_rec, prec_interp)
        pr_aucs.append(pr_auc)

        logger.info(
            f"Fold {i+1}/{len(folds)}: ROC AUC = {roc_auc:.2f}, "
            f"PR AUC = {pr_auc:.2f}"
        )

    model_data["tprs"] = tprs
    model_data["precisions"] = precs
    model_data["roc_aucs"] = roc_aucs
    model_data["pr_aucs"] = pr_aucs

    mean_roc_auc = np.mean(roc_aucs)
    mean_pr_auc = np.mean(pr_aucs)
    logger.info(f"Mean ROC AUC = {mean_roc_auc:.2f}")
    logger.info(f"Mean PR AUC = {mean_pr_auc:.2f}")

    dump(model_data, snakemake.output[0], compress=True)
    logger.info(f"Saving model to {snakemake.output[0]}")
except:
    logger.error(f"The file {snakemake.input['log_ratios']} is empty or does not exist")
    model_data = {}
    dump(model_data, snakemake.output[0], compress=True)
    logger.info(f"Saving empty model to {snakemake.output[0]}")
```
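The dictionary assembled above is serialized with `joblib.dump`, so a downstream plotting or summary step would reload it with `joblib.load`. A minimal consumer-side sketch follows; the file path is a placeholder, and averaging the per-fold interpolated TPRs is just one reasonable way to summarize the stored curves, not something prescribed by the workflow.

```python
import numpy as np
from joblib import load

# Placeholder path; in the workflow this would be the rule's input file.
model_data = load("logistic_regression_model.joblib")

mean_fpr = model_data["fprs"]                   # common FPR grid saved by the script
mean_tpr = np.mean(model_data["tprs"], axis=0)  # average the per-fold interpolated TPRs
print("Mean ROC AUC:", np.mean(model_data["roc_aucs"]))
print("Mean PR AUC:", np.mean(model_data["pr_aucs"]))
```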
```python
import logging

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

logger = logging.getLogger("qadabra")
logger.setLevel(logging.INFO)
fh = logging.FileHandler(snakemake.log[0], mode="w")
formatter = logging.Formatter(
    f"[%(asctime)s - {snakemake.rule}] :: %(message)s"
)
fh.setFormatter(formatter)
logger.addHandler(fh)

logger.info("Loading differentials...")
diff_df = pd.read_table(snakemake.input[0], sep="\t", index_col=0)
rank_df = (
    diff_df.rank(ascending=False)
    .rename(columns={f"{x}": f"{x}_rank" for x in diff_df.columns})
    .join(diff_df)
)
tool_list = diff_df.columns.tolist()
tool_rank_list = [x + "_rank" for x in tool_list]

logger.info("Scaling values...")
scaled_diff_values = StandardScaler().fit_transform(diff_df.values)

pc_cols = [f"PC{x+1}" for x in range(len(tool_list))]
logger.info("Running PCA...")
pca = PCA(n_components=len(tool_list)).fit(scaled_diff_values)
feat_df = pd.DataFrame(
    pca.transform(scaled_diff_values),
    index=diff_df.index,
    columns=pc_cols
).join(rank_df)

max_vals = feat_df[pc_cols].max()
tool_df = pd.DataFrame(
    np.dot(pca.components_, np.diag(max_vals)),
    index=diff_df.columns,
    columns=pc_cols
)
tool_df.index.name = "tool"

feat_df.to_csv(snakemake.output["features"], sep="\t", index=True)
tool_df.to_csv(snakemake.output["tools"], sep="\t", index=True)
logger.info(f"Saved feature PCs to {snakemake.output['features']}")
logger.info(f"Saved tool PCs to {snakemake.output['tools']}")

prop_exp = pca.explained_variance_ratio_
prop_exp = pd.Series(prop_exp, index=pc_cols)
prop_exp.name = "proportion_var_explained"
prop_exp.to_csv(snakemake.output["prop_exp"], sep="\t", index=True)
logger.info(f"Saved explained variance to {snakemake.output['prop_exp']}")
```
degree project, predict patient outcome with RNA-seq data
```python
import pandas as pd
import numpy as np
import os

from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.preprocessing import OneHotEncoder
from sksurv.metrics import (
    as_concordance_index_ipcw_scorer,
    as_cumulative_dynamic_auc_scorer,
    as_integrated_brier_score_scorer,
    cumulative_dynamic_auc
)
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

event = snakemake.params[0]
days = snakemake.params[1]


def load_data(dataP, spInfo, features):
    data = pd.read_csv(dataP).set_index('ID')
    sample_info = pd.read_csv(spInfo)
    data.drop(['MEAN','VAR'], axis=1, inplace=True)
    data = data.T.reset_index().rename(columns={'index':'SAMPLE'})
    data.columns.names = [None]
    data = data[['SAMPLE']+features]
    if 'RFi' in event:
        data = pd.merge(sample_info[['SAMPLE', 'RFi_days', 'RFi_event', 'RFS_all_event', 'age', 'treatGroup']], data, on='SAMPLE')
        data = data[data.RFi_days.notna()].reset_index(drop=True)
        data = data[data.apply(lambda x: False if (x['RFi_event'] == 0 and x['RFi_days'] < 600) or (x['RFi_event'] == False and x['RFS_all_event'] == True) else True, axis=1)].reset_index(drop=True)
        data.drop(['RFS_all_event'], axis=1, inplace=True)
    else:
        data = pd.merge(sample_info[['SAMPLE', days, event, 'age', 'treatGroup']], data, on='SAMPLE')
        data = data[data[event].notna()].reset_index(drop=True)
        data = data[data[days].notna()].reset_index(drop=True)
    x, y = data.iloc[::, 3:], data.iloc[::, 1:3]
    treatment = x.treatGroup.astype('category')
    x.drop('treatGroup', axis=1, inplace=True)
    x = pd.DataFrame(StandardScaler().fit_transform(x), columns=x.columns).dropna(axis=1)
    x['treatGroup'] = treatment
    x = OneHotEncoder().fit_transform(x).replace(np.nan, 0)
    return x, y


output = snakemake.output[0]
method = output.split('_')[-2]
if method == 'uni':
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}_uni_fdr.csv'))
    features = features[features['p.adjust'] < 0.05]
    features = features['fetures'].to_list()
elif method == 'ens':
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}-best.feature'))
    features = features['Feature'].to_list()
elif method == '1run':
    which = output.split('/')[0]
    dataset = lambda x: 'genes' if 'gs' in x else 'transcripts'
    features = pd.read_csv(os.path.join(os.path.dirname(output), f'{event.split("_")[0]}_1run_{which}_{dataset(output.split("/")[1])}_best_coefs.csv'), index_col=0).reset_index()
    features = features['index'].to_list()
    features = [x for x in features if x != 'age' and not x.startswith('treat')]
else:
    features = []

x, y = load_data(snakemake.input[0], snakemake.input[1], features)
os_type = {'names':('events', 'time'), 'formats':('?', '<f8')}
y = np.array([(a, b) for a, b in zip(y[event].astype(bool), y[days])], dtype=os_type)

xtrain, xval, ytrain, yval = train_test_split(x, y, test_size=0.2, random_state=954)
lower, upper = np.percentile(pd.DataFrame(ytrain).time, [25, 75])
grids_times = np.arange(lower, upper + 1)
cv = KFold(n_splits=3, shuffle=True, random_state=0)

if method == 'uni':
    alphas = 10. ** np.linspace(-1.8, -1, 5)
else:
    alphas = 10. ** np.linspace(-1.9, -1.5, 5)

grids_cindex = GridSearchCV(
    as_concordance_index_ipcw_scorer(CoxnetSurvivalAnalysis(l1_ratio=0.9, tol=1e-15, max_iter=10000), tau=grids_times[-1]),
    param_grid={"estimator__alphas": [[v] for v in alphas]},
    cv=cv,
    error_score=0.5,
    n_jobs=16).fit(xtrain, ytrain)

cph = CoxnetSurvivalAnalysis(l1_ratio=0.9, alphas=[grids_cindex.best_params_["estimator__alphas"]])
cph.fit(xtrain, ytrain)

best_coefs = pd.DataFrame(cph.coef_, index=xtrain.columns, columns=['coefficient'])
non_zero = np.sum(best_coefs.iloc[:, 0] != 0)
non_zero_coefs = best_coefs.query("coefficient != 0")

va_times = np.arange(np.percentile(pd.DataFrame(yval).time, [5, 95])[0], np.percentile(pd.DataFrame(yval).time, [5, 95])[1], 60)
cph_risk_scores = cph.predict(xval)
auc, mean_auc = cumulative_dynamic_auc(ytrain, yval, cph_risk_scores, va_times)

baseline_feature = [x for x in xtrain.columns if x == 'age' or x.startswith('treat')]
base = CoxnetSurvivalAnalysis(l1_ratio=0.9, tol=1e-20, max_iter=10000, alphas=[0])
base.fit(xtrain[baseline_feature], ytrain)
base_risk_scores = base.predict(xval[baseline_feature])
base_auc, base_mean = cumulative_dynamic_auc(ytrain, yval, base_risk_scores, va_times)

with open(output, 'w') as fo:
    fo.write(f'## training \n')
    fo.write(f'# mean cindex: {grids_cindex.cv_results_["mean_test_score"]} \n')
    fo.write(f'# best alpha: {grids_cindex.best_params_["estimator__alphas"]} \n')
    fo.write(f'## validation \n')
    fo.write(f'# cindex:{cph.score(xval, yval)} \n')
    fo.write(f'# features in: {len(features)}+age+treatment, features out: {non_zero} \n')
    fo.write(f'## baseline \n')
    fo.write(f'## features: {baseline_feature} \n')
    fo.write(f'# cindex: {base.score(xval[baseline_feature], yval)} \n')
    fo.write(f'# base_mean_auc: {base_mean} \n')
pd.DataFrame({'AUC':auc, 'base':base_auc, 'time':va_times, 'mean':[mean_auc]*len(auc)}).to_csv(output, index=False, mode='a')
```
sklearn
# ⚠️⚠️⚠️ Summary ⚠️⚠️⚠️

⚠️⚠️⚠️ The **`sklearn` PyPI package is deprecated; use `scikit-learn` instead** ⚠️⚠️⚠️

# How to fix the error for the main use cases

- use `pip install scikit-learn` rather than `pip install sklearn`
- replace `sklearn` with `scikit-learn` in your pip requirements files (`requirements.txt`, `setup.py`, `setup.cfg`, `Pipfile`, etc.)
- if the `sklearn` package is used by one of your dependencies, it would be great if you took some time to track down which package uses `sklearn` instead of `scikit-learn` and report it to their issue tracker
- as a last resort, set the environment variable `SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True` to avoid this error

# Brownout schedule

The following table shows the dates and time windows where an exception will be raised if you attempt to install the `sklearn` package from PyPI.

| Dates | Window(s) |
|-------|-----------|
| 2022
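The key point of the notice above is that only the distribution name changes: code keeps importing `sklearn`, while pip requirements must name `scikit-learn`. A small sanity check along those lines (a sketch, not part of the official notice):

```python
# Check that the real scikit-learn distribution is installed rather than the
# deprecated "sklearn" PyPI shim; the import name stays "sklearn" either way.
import importlib.metadata as md

try:
    print("scikit-learn version:", md.version("scikit-learn"))
except md.PackageNotFoundError:
    print("scikit-learn is missing; run `pip install scikit-learn` (not `pip install sklearn`).")
```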