Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
This is the snakemake pipeline in order to process RNAseq paired-end fastq files to analyze splicing events using MAJIQ software (https://majiq.biociphers.org/). The pipeline involves use of :
-
FastQC
-
Trimgalore
-
STAR alignment
-
DESEQ2
-
MAJIQ
-
Voila software (link to MAJIQ results)
All of this is wrapped in the pipeline using python/bash/R scripts. Check workflow/rules and workflow/scripts to better understand how it is processed.
To be able to reproduce results, the pipeline has been included in a singularity container. See : https://github.com/LabLuco/build_singularity_MAJIQ
Steps to run the pipeline
1. Pull img
Create a batch file to pull the img automatically. Example on the genotoul bioinfo servers :
2. Prepare the data
Put your RNA-seq paired-end raw_data (fastq.gz) in a folder. Same with an indexed genome with STAR. (check archive2/common/Elsa/ folder to get one)
3. Run the pipeline
Create a batch file to execute the singularity image to run the pipeline. Here is a minimal one.
#!/bin/bash
#SBATCH -t 03:00:00
#SBATCH --mem=20G
module load system/singularity-3.7.3
cd /home/eclaude/work
cp quality_align_majiq_latest.sif tmp_$1
mkdir $1
cd $1/
singularity run --bind $2:/softwares/snakemake_align_MAJIQ/resources/fastq --bind $3:/softwares/snakemake_align_MAJIQ/resources/genome ../tmp_$1
rm ../tmp_$1
This script allows you to :
-
load the singularity module
-
create a folder for the new analysis
-
copy/paste the singularity img to be able to run multiple analyses in parallel
-
run the singularity container with bound folders (raw data and genome)
The script needs arguments. The associated bash command line can be :
sbatch --error=name_test.err --output=name_test.err --job-name=name_test majiq.sh "name_test" "/home/eclaude/work/raw_data/" "/home/eclaude/work/genome/"
4. Get the results
All results will be stored in the newly created folder /snakemake_align_MAJIQ/results/. Check the Clean_AS_Event folder for clean results files.
Code Snippets
8 9 | script: "../scripts/alignment.py" |
12 13 14 15 16 17 18 19 20 21 22 23 | shell: """ mkdir -p ../../results/Diff_Exp/ bash ../scripts/get_count_from_star.sh {params.gtf} python3 ./../scripts/samplesheet_and_keep_gene_interest.py -control {params.control} -test {params.test} Rscript ./../scripts/deseq2.R mkdir -p ../../results/Clean_AS_Event/ mkdir -p ../../results/Clean_AS_Event/ES/ mkdir -p ../../results/Clean_AS_Event/A5SS/ mkdir -p ../../results/Clean_AS_Event/A3SS/ mkdir -p ../../results/Clean_AS_Event/IR/ """ |
19 20 | script: "../scripts/filter_events_from_voila_tsv.py" |
13 14 15 16 17 | shell: """ python3 ./../scripts/write_MAJIQ_configfile.py -g {params.genomename} -control {params.control} -test {params.test} majiq build {params.gff3} -c ../../resources/MAJIQ_conf/settings.ini -j 4 -o ../../results/MAJIQ/build_{params.control}_{params.test}/ --min-experiments {params.nbrep} """ |
10 11 | script: "../scripts/majiq_quantif_dpsi.py" |
11 12 | script: "../scripts/majiq_quantif_psi.py" |
6 7 8 9 10 11 | shell: """ rm -r -f ../../resources/fastq/ rm -r -f ../../resources/genome/ touch ../../results/finished.txt """ |
7 8 | script: "../scripts/samtools_index.py" |
9 10 | script: "../scripts/tpmcalculator.py" |
11 12 | script: "../scripts/trimgalore.py" |
9 10 11 12 | shell: """ voila tsv ../../results/MAJIQ/build_{params.control}_{params.test}/splicegraph.sql ../../results/MAJIQ/dPSI_{params.control}_{params.test}/{params.control}_{params.test}.deltapsi.voila -f ../../results/Voila/{params.control}_{params.test}.tsv --threshold 0.1 --probability-threshold 0.9 """ |
import os
import glob
import subprocess


def align(scriptdir, genomedir, namelist):
    """Run STAR (paired-end, GeneCounts mode) on every trimmed sample.

    Each entry of *namelist* is a trim_galore output directory; the sample
    id is the directory name, and the two mates are picked up through the
    ``*1.fq.gz`` / ``*2.fq.gz`` shell globs that STAR's shell expands.
    Alignment output goes to results/alignment/<sample>/<sample>_*.
    """
    for sample_dir in namelist:
        sample_id = sample_dir.split('/')[-2]
        prefix = scriptdir + '/../../results/alignment/' + sample_id + '/' + sample_id + '_'
        mate1 = sample_dir + '*1.fq.gz'
        mate2 = sample_dir + '*2.fq.gz'
        cmd = ('STAR --runThreadN 10 --quantMode GeneCounts'
               ' --genomeDir ' + genomedir +
               ' --outFileNamePrefix ' + prefix +
               ' --readFilesIn ' + mate1 + ' ' + mate2 +
               ' --readFilesCommand zcat')
        # shell=True so the fq.gz globs are expanded by the shell;
        # stdout/stderr are captured but deliberately not checked.
        subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                       universal_newlines=True, shell=True)


if __name__ == "__main__":
    base = os.path.dirname(os.path.realpath(__file__))
    align(base,
          base + '/../../resources/genome/',
          glob.glob(base + '/../../results/trim_galore/*/'))
# deseq2.R — differential expression with DESeq2 on the STAR gene counts.
# Reads samplesheet.tsv, tx2gene.tsv and raw_counts_matrix.tsv from
# results/Diff_Exp/ (produced by earlier pipeline steps) and writes
# normalized_counts.tsv, two QC plots and deseq2_results.tsv.
setwd("./../../results/Diff_Exp/")
# lib.loc is pinned to the package library inside the singularity image.
library(SummarizedExperiment, lib.loc='/usr/lib/R/library')
library(BiocGenerics, lib.loc='/usr/lib/R/library')
library(GenomicRanges, lib.loc='/usr/lib/R/library')
library(IRanges, lib.loc='/usr/lib/R/library')
library(S4Vectors, lib.loc='/usr/lib/R/library')
library(DESeq2, lib.loc='/usr/lib/R/library')
# Sample sheet: one row per sample with its condition (control/test).
sampletable <- read.table("samplesheet.tsv", header=T, sep="\t")
rownames(sampletable) <- sampletable$sample_name
# tx2gene.tsv has no header; columns stay V1/V2/V3 (V2/V3 are relied on
# downstream by filter_events_from_voila_tsv.py).
tx2gene <- read.table("tx2gene.tsv",sep="\t",header=F)
# read in the matrix
count_matrix <- read.delim("raw_counts_matrix.tsv", header=T, sep="\t", row.names=1)
# create the DESeq object
se_star_matrix <- DESeqDataSetFromMatrix(countData = count_matrix, colData = sampletable, design = ~ condition)
se_star2 <- DESeq(se_star_matrix)
# normalized = TRUE: divide the counts by the size factors calculated by the DESeq function
norm_counts <- log2(counts(se_star2, normalized = TRUE)+1)
# add the gene symbols
norm_counts_symbols <- merge(unique(tx2gene[,2:3]), data.frame(ID=rownames(norm_counts), norm_counts), by=1, all=F)
# write normalized counts to text file
write.table(norm_counts_symbols, "normalized_counts.tsv", quote=F, col.names=T, row.names=F, sep="\t")
###### VISUALIZATION ######
vsd <- varianceStabilizingTransformation(se_star2)
### samples correlation ###
library(pheatmap)
# calculate between-sample distance matrix
sampleDistMatrix <- as.matrix(dist(t(assay(vsd))))
png("sample_distance_heatmap_star.png")
pheatmap(sampleDistMatrix)
dev.off()
### PCA ###
png("PCA_star.png")
plotPCA(object = vsd,intgroup = "condition")
dev.off()
###### Diff Exp Analysis ######
# Use "control" as the reference level so the coefficient is test vs control.
se_star2$condition <- relevel(se_star2$condition, ref = "control")
se_star2 <- nbinomWaldTest(se_star2)
resultsNames(se_star2)
# contrast: the column from the metadata that is used for the grouping of
# the samples, then the baseline and the group compared to the baseline
de_shrink <- lfcShrink(dds = se_star2, coef="condition_test_vs_control", type="apeglm")
de_symbols <- merge(unique(tx2gene[,2:3]), data.frame(ID=rownames(de_shrink), de_shrink), by=1, all=F)
write.table(de_symbols, "deseq2_results.tsv", quote=F, col.names=T, row.names=F, sep="\t")
"""Collapse a Voila deltapsi TSV into per-junction alternative-splicing
event tables (ES / A5SS / A3SS / IR), annotate them with per-replicate
read counts from the .majiq files and with DESeq2 statistics, and write
one cleaned TSV per event type and per |dPSI| threshold (0.1 and 0.2).

Runs as a snakemake ``script:``; snakemake.params[0] is the control
condition name and snakemake.params[1] the test condition name.
"""
import pandas
import numpy as np
import re
import os
import glob

# LSV TYPE specifies if it is about a target or a source exon such as : s|... or t|...
# LSV TYPE may be composed of various splicing events defined as : AeB.CoD
# LSV TYPE may contain an intron which is specified at the end such as : ...|i


def main():
    # All input/output paths are resolved relative to this script's location.
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    control = snakemake.params[0]
    test = snakemake.params[1]
    # Voila deltapsi table: one row per LSV, ';'-separated per-junction fields.
    voilafile = scriptdir+'/../../results/Voila/'+control+'_'+test+'.tsv'
    voilatsv = pandas.read_csv(voilafile, header=0, comment='#', sep='\t')
    # DESeq2 results, merged in later to attach expression stats per gene.
    deseqfile = scriptdir+'/../../results/Diff_Exp/deseq2_results.tsv'
    deseq = pandas.read_csv(deseqfile,header=0,sep='\t')
    majiqfiles = glob.glob(scriptdir+'/../../results/MAJIQ/build_'+control+'_'+test+'/*.majiq')
    fulldf,fulldfir = extract_events(voilatsv,control,test,deseq)
    # Two thresholded views: |dPSI| > 0.1 and > 0.2 (P(changing) > 0.9 for both).
    fulldf1 = remove_low_dpsi_and_proba(fulldf,0.1,0.9)
    fulldf2 = remove_low_dpsi_and_proba(fulldf,0.2,0.9)
    fulldfir1 = remove_low_dpsi_and_proba(fulldfir,0.1,0.9)
    fulldfir2 = remove_low_dpsi_and_proba(fulldfir,0.2,0.9)
    allrep,controlcol,testcol = explore_majiq_files(control,test,majiqfiles)
    fulldf1 = add_reads(control,test,allrep,controlcol,testcol,fulldf1)
    fulldf2 = add_reads(control,test,allrep,controlcol,testcol,fulldf2)
    fulldfir1 = add_reads(control,test,allrep,controlcol,testcol,fulldfir1,'IR')
    fulldfir2 = add_reads(control,test,allrep,controlcol,testcol,fulldfir2,'IR')
    outputdir = scriptdir+'/../../results/Clean_AS_Event/'
    get_only_X_event(control,test,fulldf1,fulldf2,'ES',outputdir)
    get_only_X_event(control,test,fulldf1,fulldf2,'A5SS',outputdir)
    get_only_X_event(control,test,fulldf1,fulldf2,'A3SS',outputdir)
    get_only_X_event(control,test,fulldfir1,fulldfir2,'IR',outputdir)


def initdf(cond1,cond2):
    """Return an empty DataFrame with the column layout used for non-IR events."""
    newdf = pandas.DataFrame(columns=['gene_name','gene_id','lsv_id',
        'mean_dpsi_per_lsv_junction','probability_changing',
        'probability_non_changing',cond1+'_mean_psi',
        cond2+'_mean_psi','de_novo_junctions',
        'strand','place_constitutive_exon','seqid','junctions_coords','skipped_exons_coords',
        'ES','A5SS','A3SS'])
    return newdf


def initdfIR(cond1,cond2):
    """Return an empty DataFrame with the column layout used for intron-retention events."""
    newdf = pandas.DataFrame(columns=['gene_name','gene_id','lsv_id',
        'mean_dpsi_per_lsv_junction','probability_changing',
        'probability_non_changing',cond1+'_mean_psi',
        cond2+'_mean_psi','de_novo_junctions',
        'strand','place_constitutive_exon','seqid','junctions_coords',
        'ir_coords','IR'])
    return newdf


def extract_events(voilatsv,cond1,cond2,deseq):
    """Explode each Voila LSV row into one row per junction and classify it.

    Parses the ``lsv_type`` string ('t'/'s' prefix, then one AeB.CoD token
    per junction, optional trailing 'i' for an intron-retention junction)
    and flags each junction as ES / A5SS / A3SS (or IR).  Returns the
    non-IR table and the IR table, both already merged with the DESeq2
    results (which are assumed to carry gene ids in column V2 — the
    header-less tx2gene columns from deseq2.R).
    """
    fulldf = initdf(cond1,cond2)
    fulldfir = initdfIR(cond1,cond2)
    for index,row in voilatsv.iterrows():
        lsvtype = row['lsv_type']
        lsvsplit = lsvtype.split('|')
        target = lsvsplit[0]
        # Skip LSVs Voila could not fully type.
        if 'na' in lsvsplit :
            continue
        if target == 't' :
            # First pass over the junction tokens: collect B (exon index),
            # C and the maximum of D so the per-junction flags can be
            # decided relative to the whole LSV.
            listB = []
            listC = []
            maxD = 1
            for event in lsvsplit :
                if event not in ['t','i']:
                    b = int(re.search('e(.*)\.',event).group(1))
                    listB.append(b)
                    d = int(re.search('o(.*)',event).group(1))
                    if d > maxD :
                        maxD = d
                    c = int(re.search('\.(.*)o',event).group(1))
                    listC.append(c)
            maxB = max(listB)
            # Second pass: one output row per junction token.
            for index in range (1,len(lsvsplit)) :
                if index == len(lsvsplit)-1 and lsvsplit[index] == 'i':
                    dfevent = initdfIR(cond1,cond2)
                    dfevent.loc[0,'ir_coords'] = row['ir_coords']
                    dfevent.loc[0,'IR'] = 'TRUE'
                else :
                    dfevent = initdf(cond1,cond2)
                    dfevent.loc[0,'ES'] = 'FALSE' # INIT VALUE
                    dfevent.loc[0,'A5SS'] = 'FALSE' # INIT VALUE
                    dfevent.loc[0,'A3SS'] = 'FALSE' # INIT VALUE
                dfevent.loc[0,'gene_name'] = row['gene_name']
                dfevent.loc[0,'gene_id'] = row['gene_id']
                dfevent.loc[0,'lsv_id'] = row['lsv_id']
                # The per-junction Voila fields are ';'-joined; token index
                # 'index' maps to field index-1 (token 0 is the t/s prefix).
                dfevent.loc[0,'mean_dpsi_per_lsv_junction'] = row['mean_dpsi_per_lsv_junction'].split(';')[index-1]
                dfevent.loc[0,'probability_changing'] = row['probability_changing'].split(';')[index-1]
                dfevent.loc[0,'probability_non_changing'] = row['probability_non_changing'].split(';')[index-1]
                dfevent.loc[0,cond1+'_mean_psi'] = row[cond1+'_mean_psi'].split(';')[index-1]
                dfevent.loc[0,cond2+'_mean_psi'] = row[cond2+'_mean_psi'].split(';')[index-1]
                dfevent.loc[0,'de_novo_junctions'] = row['de_novo_junctions'].split(';')[index-1] # TODO: convert to a real boolean later
                dfevent.loc[0,'strand'] = row['strand']
                dfevent.loc[0,'place_constitutive_exon'] = 'TARGET'
                dfevent.loc[0,'seqid'] = row['seqid']
                dfevent.loc[0,'junctions_coords'] = row['junctions_coords'].split(';')[index-1]
                if lsvsplit[index] != 'i':
                    a = int(re.search('^(.*)e',lsvsplit[index]).group(1))
                    b = int(re.search('e(.*)\.',lsvsplit[index]).group(1))
                    c = int(re.search('\.(.*)o',lsvsplit[index]).group(1))
                    d = int(re.search('o(.*)',lsvsplit[index]).group(1))
                    # diffc: another junction reaches the same exon B with a
                    # different splice-site index C (alternative site usage).
                    diffc = False
                    indexB = [i for i,value in enumerate(listB) if value==b]
                    for i in indexB :
                        if listC[i] != c :
                            diffc = True
                    if b != maxB :
                        dfevent.loc[0,'ES'] = 'TRUE'
                        skippedex = [row['seqid']+':' + s for s in row['exons_coords'].split(';')[1:-b]]
                        dfevent.loc[0,'skipped_exons_coords'] = ' '.join(skippedex)
                    if d != 1 and c != maxD and listB.count(b) > 1 and diffc == True :
                        dfevent.loc[0,'A5SS'] = 'TRUE'
                    if a != 1 :
                        dfevent.loc[0,'A3SS'] = 'TRUE'
                if index == len(lsvsplit)-1 and lsvsplit[index] == 'i':
                    fulldfir = pandas.concat([fulldfir,dfevent])
                else :
                    fulldf = pandas.concat([fulldf,dfevent])
        elif target == 's':
            # Mirror of the 't' branch for source LSVs; the roles of the
            # A5SS/A3SS tests are swapped relative to the target case.
            maxA = 1
            listB = []
            listC = []
            for event in lsvsplit :
                if event not in ['s','i']:
                    a = int(re.search('(.*)e',event).group(1))
                    if a > maxA :
                        maxA = a
                    b = int(re.search('e(.*)\.',event).group(1))
                    listB.append(b)
                    c = int(re.search('\.(.*)o',event).group(1))
                    listC.append(c)
            for index in range (1,len(lsvsplit)) :
                if index == len(lsvsplit)-1 and lsvsplit[index] == 'i':
                    dfevent = initdfIR(cond1,cond2)
                    dfevent.loc[0,'ir_coords'] = row['ir_coords']
                    dfevent.loc[0,'IR'] = 'TRUE'
                else :
                    dfevent = initdf(cond1,cond2)
                    dfevent.loc[0,'ES'] = 'FALSE' # INIT VALUE
                    dfevent.loc[0,'A5SS'] = 'FALSE' # INIT VALUE
                    dfevent.loc[0,'A3SS'] = 'FALSE' # INIT VALUE
                dfevent.loc[0,'gene_name'] = row['gene_name']
                dfevent.loc[0,'gene_id'] = row['gene_id']
                dfevent.loc[0,'lsv_id'] = row['lsv_id']
                dfevent.loc[0,'mean_dpsi_per_lsv_junction'] = row['mean_dpsi_per_lsv_junction'].split(';')[index-1]
                dfevent.loc[0,'probability_changing'] = row['probability_changing'].split(';')[index-1]
                dfevent.loc[0,'probability_non_changing'] = row['probability_non_changing'].split(';')[index-1]
                dfevent.loc[0,cond1+'_mean_psi'] = row[cond1+'_mean_psi'].split(';')[index-1]
                dfevent.loc[0,cond2+'_mean_psi'] = row[cond2+'_mean_psi'].split(';')[index-1]
                dfevent.loc[0,'de_novo_junctions'] = row['de_novo_junctions'].split(';')[index-1] # TODO: convert to a real boolean later ?
                dfevent.loc[0,'strand'] = row['strand']
                dfevent.loc[0,'place_constitutive_exon'] = 'SOURCE'
                dfevent.loc[0,'seqid'] = row['seqid']
                dfevent.loc[0,'junctions_coords'] = row['junctions_coords'].split(';')[index-1]
                if lsvsplit[index] != 'i':
                    a = int(re.search('^(.*)e',lsvsplit[index]).group(1))
                    b = int(re.search('e(.*)\.',lsvsplit[index]).group(1))
                    c = int(re.search('\.(.*)o',lsvsplit[index]).group(1))
                    d = int(re.search('o(.*)',lsvsplit[index]).group(1))
                    diffc = False
                    indexB = [i for i,value in enumerate(listB) if value==b]
                    for i in indexB :
                        if listC[i] != c :
                            diffc = True
                    if b != 1 :
                        dfevent.loc[0,'ES'] = 'TRUE'
                        skippedex = [row['seqid']+':'+ s for s in row['exons_coords'].split(';')[1:b]]
                        dfevent.loc[0,'skipped_exons_coords'] = ' '.join(skippedex)
                    if a != maxA :
                        dfevent.loc[0,'A5SS'] = 'TRUE'
                    if d != 1 and c != 1 and listB.count(b) > 1 and diffc == True:
                        dfevent.loc[0,'A3SS'] = 'TRUE'
                if index == len(lsvsplit)-1 and lsvsplit[index] == 'i':
                    fulldfir = pandas.concat([fulldfir,dfevent])
                else :
                    fulldf = pandas.concat([fulldf,dfevent])
    # Collapse seqid + junction coordinates into a single 'chr:start-end' key.
    fulldf['junction_coords'] = fulldf[['seqid', 'junctions_coords']].agg(':'.join, axis=1)
    fulldfir['junction_coords'] = fulldfir[['seqid', 'junctions_coords']].agg(':'.join, axis=1)
    fulldf = fulldf.drop(['seqid', 'junctions_coords'], axis=1)
    fulldfir = fulldfir.drop(['seqid', 'junctions_coords'], axis=1)
    # Attach DESeq2 stats; V2/V3 come from the header-less tx2gene merge in deseq2.R.
    deseq = deseq.drop(['V3'],axis=1)
    fulldf = pandas.merge(fulldf,deseq,how='inner',left_on='gene_id',right_on='V2')
    fulldfir = pandas.merge(fulldfir,deseq,how='inner',left_on='gene_id',right_on='V2')
    fulldf = fulldf.drop(['V2'],axis=1)
    fulldfir = fulldfir.drop(['V2'],axis=1)
    # Drop rows whose skipped-exon coordinates could not be resolved ('nan' text).
    fulldf = fulldf[~fulldf.skipped_exons_coords.str.contains("nan",na=False)]
    return fulldf,fulldfir


def remove_low_dpsi_and_proba(fulldf,threshholddpsi,threshholdproba):
    """Keep rows with |dPSI| above *threshholddpsi* AND P(changing) above *threshholdproba*."""
    fulldf['mean_dpsi_per_lsv_junction'] = fulldf['mean_dpsi_per_lsv_junction'].astype(float)
    fulldf['probability_changing'] = fulldf['probability_changing'].astype(float)
    fulldf = fulldf[fulldf['mean_dpsi_per_lsv_junction'].abs() > threshholddpsi]
    fulldf = fulldf[fulldf['probability_changing'] > threshholdproba]
    return fulldf


def remove_duplicates(df):
    """Keep one row per (gene_name, junction_coords) pair.

    NOTE(review): sorts by |dPSI| ascending and keeps the first, i.e. the
    LOWEST |dPSI| duplicate survives — confirm that is intended.
    NOTE(review): drop('abs', 1) uses the positional axis argument removed
    in pandas 2.0 — confirm the pinned pandas version.
    """
    df = (df.assign(abs=df['mean_dpsi_per_lsv_junction'].abs()).sort_values(['junction_coords','abs'],ascending=[True, True]).drop('abs', 1))
    df = df.drop_duplicates(subset=['gene_name', 'junction_coords'], keep='first')
    return df


def explore_majiq_files(control,test,majiqfiles):
    """Load the 'junc_info' array of each .majiq file into a DataFrame.

    Returns (allrep, controlcol, testcol): one DataFrame per replicate
    keyed by its name, plus the 'reads_<rep>' column names split into the
    control and test groups by filename prefix.
    """
    controllist = []
    testlist = []
    allrep = {}
    ## get all events and their reads in one dic ##
    for file in majiqfiles :
        namerep = file.split('/')[-1].split('.majiq')[0]
        arrayrep = np.load(file)['junc_info']
        dfrep = pandas.DataFrame.from_records(arrayrep,columns=[])
        dfrep.columns = ['lsv_id','start','end','reads_'+namerep,'positions']
        # lsv ids are stored as bytes in the npz archive.
        dfrep['lsv_id'] = dfrep['lsv_id'].map(lambda x: x.decode('UTF-8'))
        # dfrep['reads_'+namerep] = dfrep['reads_'+namerep].astype(int).astype(str)
        # print(dfrep['reads_'+namerep].dtype)
        allrep[namerep] = dfrep
        # NOTE(review): control replicates must BOTH contain '_control' and
        # start with '<control>_' to be grouped — confirm the naming scheme.
        if '_control' in namerep and re.match('^'+control+'_', namerep):
            controllist.append(namerep)
        elif re.match('^'+test+'_', namerep) :
            testlist.append(namerep)
    controlcol = []
    testcol = []
    for namectrl in controllist :
        controlcol.append('reads_'+namectrl)
    for nametest in testlist :
        testcol.append('reads_'+nametest)
    return allrep,controlcol,testcol


def add_reads(control,test,allrep,controlcol,testcol,finaldf,event='notIR'):
    """Join per-replicate junction read counts onto *finaldf*.

    For every replicate, matches rows on (lsv_id, 'start-end' coordinate),
    then folds all per-replicate counts into a single
    'reads_per_junctions_per_replicates(control|test)' column formatted as
    'c1::c2::...|t1::t2::...'.  *event* selects the column layout
    ('notIR' or 'IR').
    """
    # Strip the leading 'chr<seqid>:' to get the bare 'start-end' key.
    # NOTE(review): str.replace here relies on regex interpretation of the
    # pattern; newer pandas defaults regex=False — confirm pinned version.
    finaldf['junction_coords_simple'] = finaldf['junction_coords'].str.replace("^chr.*:", "")
    finaldftotest = finaldf
    adaptallrep = {}
    ## keep only events of interest by comparing to the new df built ##
    for key in allrep:
        adaptallrep[key] = allrep[key].merge(finaldftotest,on=['lsv_id'])
        adaptallrep[key]['start'] = adaptallrep[key]['start'].apply(str)
        adaptallrep[key]['end'] = adaptallrep[key]['end'].apply(str)
        adaptallrep[key]['junction_test'] = adaptallrep[key][['start', 'end']].agg('-'.join, axis=1)
        adaptallrep[key] = adaptallrep[key][adaptallrep[key].apply(lambda x: x['junction_test'] in x['junction_coords'], axis=1)]
        # NOTE(review): this isin() compares the column with itself and is a no-op.
        adaptallrep[key] = adaptallrep[key][(adaptallrep[key]['lsv_id'].isin(adaptallrep[key]['lsv_id']))]
        if event == 'notIR' :
            adaptallrep[key] = adaptallrep[key].drop(['start', 'end', 'positions', 'gene_name', 'gene_id', 'mean_dpsi_per_lsv_junction', 'probability_changing', 'probability_non_changing', control+'_mean_psi', test+'_mean_psi', 'de_novo_junctions', 'strand', 'place_constitutive_exon', 'ES', 'A5SS', 'A3SS', 'skipped_exons_coords', 'junction_coords', 'baseMean', 'log2FoldChange', 'lfcSE', 'pvalue', 'padj','junction_coords_simple'],axis=1)
        elif event == 'IR' :
            adaptallrep[key] = adaptallrep[key].drop(['start', 'end', 'positions', 'gene_name', 'gene_id', 'mean_dpsi_per_lsv_junction', 'probability_changing', 'probability_non_changing', control+'_mean_psi', test+'_mean_psi', 'de_novo_junctions', 'strand', 'place_constitutive_exon', 'ir_coords', 'IR', 'junction_coords','baseMean', 'log2FoldChange', 'lfcSE', 'pvalue', 'padj','junction_coords_simple'],axis=1)
        # Left-join this replicate's reads column onto the event table.
        finaldf = pandas.merge(finaldf,adaptallrep[key],right_on=['lsv_id','junction_test'],left_on=['lsv_id','junction_coords_simple'],how='left')
        finaldf = finaldf.drop(['junction_test'],axis=1)
    finaldf[controlcol] = finaldf[controlcol].astype(str)
    finaldf[testcol] = finaldf[testcol].astype(str)
    finaldf['control_reads'] = finaldf[controlcol].agg('::'.join, axis=1)
    finaldf['test_reads'] = finaldf[testcol].agg('::'.join, axis=1)
    finaldf['reads_per_junctions_per_replicates(control|test)'] = finaldf[['control_reads', 'test_reads']].agg('|'.join, axis=1)
    finaldf = finaldf.drop(controlcol,axis=1)
    finaldf = finaldf.drop(testcol,axis=1)
    finaldf = finaldf.drop(['control_reads','test_reads','junction_coords_simple'],axis=1)
    # Final column order for the output TSVs.
    if event == 'notIR' :
        finaldf = finaldf[['gene_name', 'gene_id', 'lsv_id', 'mean_dpsi_per_lsv_junction', 'probability_changing', 'probability_non_changing', control+'_mean_psi', test+'_mean_psi', 'reads_per_junctions_per_replicates(control|test)', 'de_novo_junctions', 'strand', 'place_constitutive_exon', 'ES', 'A5SS', 'A3SS', 'skipped_exons_coords', 'junction_coords', 'baseMean', 'log2FoldChange', 'lfcSE', 'pvalue', 'padj']]
    elif event == 'IR' :
        finaldf = finaldf[['gene_name', 'gene_id', 'lsv_id', 'mean_dpsi_per_lsv_junction', 'probability_changing', 'probability_non_changing', control+'_mean_psi', test+'_mean_psi', 'reads_per_junctions_per_replicates(control|test)', 'de_novo_junctions', 'strand', 'place_constitutive_exon', 'IR', 'ir_coords', 'junction_coords', 'baseMean', 'log2FoldChange', 'lfcSE', 'pvalue', 'padj']]
    return finaldf


def get_only_X_event(control,test,fulldf1,fulldf2,eventtype,outputdir):
    """Write the rows flagged TRUE for *eventtype* to Clean_AS_Event/<type>/.

    *fulldf1* holds the |dPSI|>0.1 view (suffix _01), *fulldf2* the
    |dPSI|>0.2 view (suffix _02); duplicates are collapsed first.
    """
    if eventtype == 'ES' :
        dfES1 = fulldf1[fulldf1['ES'] == 'TRUE']
        dfES1 = remove_duplicates(dfES1)
        dfES1.to_csv(outputdir+'ES/'+control+'_'+test+'_ES_01.tsv',header=1, sep='\t',index=False)
        dfES2 = fulldf2[fulldf2['ES'] == 'TRUE']
        dfES2 = remove_duplicates(dfES2)
        dfES2.to_csv(outputdir+'ES/'+control+'_'+test+'_ES_02.tsv',header=1, sep='\t',index=False)
    elif eventtype == 'A5SS' :
        dfA5SS1 = fulldf1[fulldf1['A5SS'] == 'TRUE']
        dfA5SS1 = remove_duplicates(dfA5SS1)
        dfA5SS1.to_csv(outputdir+'A5SS/'+control+'_'+test+'_A5SS_01.tsv',header=1, sep='\t',index=False)
        dfA5SS2 = fulldf2[fulldf2['A5SS'] == 'TRUE']
        dfA5SS2 = remove_duplicates(dfA5SS2)
        dfA5SS2.to_csv(outputdir+'A5SS/'+control+'_'+test+'_A5SS_02.tsv',header=1, sep='\t',index=False)
    elif eventtype == 'A3SS' :
        dfA3SS1 = fulldf1[fulldf1['A3SS'] == 'TRUE']
        dfA3SS1 = remove_duplicates(dfA3SS1)
        dfA3SS1.to_csv(outputdir+'A3SS/'+control+'_'+test+'_A3SS_01.tsv',header=1, sep='\t',index=False)
        dfA3SS2 = fulldf2[fulldf2['A3SS'] == 'TRUE']
        dfA3SS2 = remove_duplicates(dfA3SS2)
        dfA3SS2.to_csv(outputdir+'A3SS/'+control+'_'+test+'_A3SS_02.tsv',header=1, sep='\t',index=False)
    elif eventtype == 'IR' :
        dfIR1 = fulldf1[fulldf1['IR'] == 'TRUE']
        dfIR1 = remove_duplicates(dfIR1)
        dfIR1.to_csv(outputdir+'IR/'+control+'_'+test+'_IR_01.tsv',header=1, sep='\t',index=False)
        dfIR2 = fulldf2[fulldf2['IR'] == 'TRUE']
        dfIR2 = remove_duplicates(dfIR2)
        dfIR2.to_csv(outputdir+'IR/'+control+'_'+test+'_IR_02.tsv',header=1, sep='\t',index=False)


if __name__ == "__main__":
    main()
# Build the raw counts matrix for DESeq2 from STAR's per-sample
# ReadsPerGene.out.tab files, plus a tx2gene mapping from the GTF.
# Usage: get_count_from_star.sh <annotation.gtf>
gtf=$1
# Paste all count files side by side, drop STAR's summary rows (N_*),
# and keep the gene id plus every 4th column (reverse-strand counts).
# NOTE(review): the doubled {{ }} braces look like Snakemake brace-escaping
# left in a standalone script; awk still parses them as nested blocks.
paste ../../results/alignment/*/*_ReadsPerGene.out.tab | grep -v "_" | awk '{{printf "%s\t", $1}}{{for (i=4;i<=NF;i+=4) printf "%s\t", $i; printf "\n" }}' > ../../results/Diff_Exp/tmp
# Build the one-line header: "gene_name" followed by every sample name.
# NOTE(review): the trailing '|' pipes into the for-loop — legal bash, but
# presumably unintended; confirm it was meant to be a plain newline.
echo -n -e "gene_name\t" > ../../results/Diff_Exp/tmpheader|
for file in ../../results/alignment/*/*_ReadsPerGene.out.tab; do basename $file _ReadsPerGene.out.tab | tr '\n' '\t' >> ../../results/Diff_Exp/tmpheader ; done
echo -n -e "\n" >> ../../results/Diff_Exp/tmpheader
# Insert the header file as the first line of the matrix.
sed -i -e '1 { r ../../results/Diff_Exp/tmpheader' -e 'N; }' ../../results/Diff_Exp/tmp
# Drop the trailing empty field produced by the printf loop above.
awk -F'\t' 'BEGIN { OFS = FS }; NF { NF -= 1}; 1' < ../../results/Diff_Exp/tmp > ../../results/Diff_Exp/tmp2
mv ../../results/Diff_Exp/tmp2 ../../results/Diff_Exp/raw_counts_matrix.tsv
rm ../../results/Diff_Exp/tmpheader
rm ../../results/Diff_Exp/tmp
# NOTE(review): tmp2 was already moved away above, so this rm reports a
# missing file (harmless without `set -e`).
rm ../../results/Diff_Exp/tmp2
# tx2gene.tsv: transcript id, gene id, gene name for every 'transcript'
# line of the GTF (fields split on double quotes).
cat ${gtf} | awk -F "\t" 'BEGIN{{OFS="\t"}}{{if($3=="transcript"){{split($9, a, "\""); print a[4],a[2],a[8]}}}}' > ../../results/Diff_Exp/tx2gene.tsv
import os
import glob
import subprocess
import re


def deltapsi(scriptdir, majiqlist):
    """Run `majiq deltapsi` comparing the control and test replicate groups.

    Splits *majiqlist* into the two groups by filename prefix.  Runs as a
    snakemake script: params[0] = control name, params[1] = test name,
    params[2] = --min-experiments value.
    """
    ctrl_name = snakemake.params[0]
    test_name = snakemake.params[1]
    outputdir = scriptdir + '/../../results/MAJIQ/dPSI_' + ctrl_name + '_' + test_name + '/'
    ctrl_files = []
    test_files = []
    for path in majiqlist:
        basename = path.split('/')[-1]
        if re.match('^' + ctrl_name + '_', basename):
            ctrl_files.append(path)
        elif re.match('^' + test_name + '_', basename):
            test_files.append(path)
    cmd = ('majiq deltapsi -grp1 ' + ' '.join(ctrl_files) +
           ' -grp2 ' + ' '.join(test_files) +
           ' -j 10 --min-experiments ' + str(snakemake.params[2]) +
           ' -o ' + outputdir +
           ' -n ' + ctrl_name + ' ' + test_name)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    proc.communicate()


if __name__ == "__main__":
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    majiqlist = glob.glob(scriptdir + '/../../results/MAJIQ/build_' + snakemake.params[0] + '_' + snakemake.params[1] + '/*.majiq')
    deltapsi(scriptdir, majiqlist)
import os
import glob
import subprocess
import re


def _run(cmd):
    # Launch a shell command and wait for it (stdout swallowed).
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    proc.communicate()


def psi(scriptdir, majiqlist):
    """Quantify PSI with MAJIQ: once per replicate, then once per condition.

    Snakemake params: [0] control name, [1] test name, [2] min-experiments.
    The grouped runs are named '<condition>_mean'.
    """
    ctrl_name = snakemake.params[0]
    test_name = snakemake.params[1]
    outputdir1 = scriptdir + '/../../results/MAJIQ/PSI_' + ctrl_name + '/'
    outputdir2 = scriptdir + '/../../results/MAJIQ/PSI_' + test_name + '/'
    controllist = [f for f in majiqlist if re.match('^' + ctrl_name + '_', f.split('/')[-1])]
    testlist = [f for f in majiqlist if re.match('^' + test_name + '_', f.split('/')[-1])]
    ## psi per replicates
    for replist, outdir in ((controllist, outputdir1), (testlist, outputdir2)):
        for rep in replist:
            repname = rep.split('/')[-1].split('.')[0]
            _run('majiq psi ' + rep + ' -n ' + repname + ' -o ' + outdir + repname + '/')
    ## Mean psi
    minexp = str(snakemake.params[2])
    _run('majiq psi -o ' + outputdir1 + ' -j 10 -n ' + ctrl_name + '_mean --min-experiments ' + minexp + ' ' + ' '.join(controllist))
    _run('majiq psi -o ' + outputdir2 + ' -j 10 -n ' + test_name + '_mean --min-experiments ' + minexp + ' ' + ' '.join(testlist))


if __name__ == "__main__":
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    majiqlist = glob.glob(scriptdir + '/../../results/MAJIQ/build_' + snakemake.params[0] + '_' + snakemake.params[1] + '/*.majiq')
    psi(scriptdir, majiqlist)
import glob
import re
import pandas as pd
import os
import argparse


def write_samplesheet(scriptdir, totest):
    """Write samplesheet.tsv (sample_name, condition) for DESeq2.

    Samples whose name contains 'control' are labelled control, the rest
    test; rows are reordered to match the count-matrix column order.
    """
    records = {'sample_name': [], 'condition': []}
    for path in glob.glob(scriptdir + '/../../results/alignment/*/*_ReadsPerGene.out.tab'):
        sample = path.split('/')[-1].split('_ReadsPerGene.out.tab')[0]
        records['sample_name'].append(sample)
        records['condition'].append('control' if 'control' in sample else 'test')
    sheet = pd.DataFrame.from_dict(records)
    # Column 0 of the counts matrix is the gene column; the rest are samples.
    column_order = totest.columns.tolist()[1:]
    sheet = sheet.set_index(['sample_name']).reindex(column_order).reset_index()
    sheet.to_csv(scriptdir + '/../../results/Diff_Exp/samplesheet.tsv', header=1, sep='\t', index=False)


def extract_gene_interest(scriptdir, interest, totest):
    """Drop (in place) count rows whose gene is absent from the Voila table,
    then write the filtered matrix to raw_counts_matrix.filtered.tsv."""
    not_of_interest = ~totest['gene_name'].isin(interest['gene_id'])
    totest.drop(totest[not_of_interest].index, inplace=True)
    totest.to_csv(scriptdir + '/../../results/Diff_Exp/raw_counts_matrix.filtered.tsv', header=1, sep='\t', index=False)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-control', help='string for control condition name')
    parser.add_argument('-test', help='string for test condition name')
    args = parser.parse_args()
    control = args.control
    test = args.test
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    interest = pd.read_csv(scriptdir + '/../../results/Voila/' + control + '_' + test + '.tsv', header=0, sep='\t', comment='#')
    totest = pd.read_csv(scriptdir + '/../../results/Diff_Exp/raw_counts_matrix.tsv', header=0, sep='\t')
    write_samplesheet(scriptdir, totest)
    extract_gene_interest(scriptdir, interest, totest)
import os
import glob
import subprocess


def index(scriptdir, namelist):
    """Sort each STAR SAM in *namelist* into <name>.sorted.bam, then BAM-index it."""
    for samfile in namelist:
        sortedname = samfile[:-4] + '.sorted.bam'
        # samtools sort writes the sorted BAM through shell redirection.
        proc = subprocess.Popen('samtools sort ' + samfile + ' > ' + sortedname,
                                shell=True, stdout=subprocess.PIPE)
        proc.communicate()
        sortedbam = sortedname  # kept from the original; currently unused
        proc = subprocess.Popen('samtools index -b ' + sortedname,
                                shell=True, stdout=subprocess.PIPE)
        proc.communicate()


if __name__ == "__main__":
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    namelist = glob.glob(scriptdir + '/../../results/alignment/*/*_Aligned.out.sam')
    index(scriptdir, namelist)
import os
import glob
import subprocess


def tpmcalculator(scriptdir, bamlist):
    """Run TPMCalculator on every sorted BAM, then move its output files
    into results/TPM/<sample>/ (snakemake.params[0] is the GTF path)."""
    outputdir = scriptdir + '/../../results/TPM/'
    for bam in bamlist:
        sample = bam.split('/')[-2]
        cmd = 'TPMCalculator -b ' + bam + ' -g ' + snakemake.params[0] + ' -q 255 -e'
        print(cmd)
        # TPMCalculator writes *sorted_* files into the CWD; create the
        # sample folder and move them there afterwards.
        for step in (cmd,
                     'mkdir ' + outputdir + sample + '/',
                     'mv *sorted_* ' + outputdir + sample + '/'):
            proc = subprocess.Popen(step, shell=True, stdout=subprocess.PIPE)
            proc.communicate()


if __name__ == "__main__":
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    bamlist = glob.glob(scriptdir + '/../../results/alignment/*/*_Aligned.out.sorted.bam')
    tpmcalculator(scriptdir, bamlist)
import os
import glob
import subprocess

def execute_trim(rep,control,test,scriptdir):
    """Group raw fastq.gz files into condition/replicate pairs and run
    trim_galore (paired mode, with FastQC) on each pair.

    Expects files named '<condition>_REP<k>_..._R1.fastq.gz' /
    '..._R2.fastq.gz' under resources/fastq/, with *rep* replicates per
    condition (snakemake params: [0] rep count, [1] control name,
    [2] test name).
    """
    fastqlist = glob.glob(scriptdir+'/../../resources/fastq/*.fastq.gz')
    # samples[condition]['REP<k>'] -> list of the (expected two) mate files.
    samples = {control :{}, test:{}}
    for i in range(0,rep):
        samples[control]['REP'+str(i+1)] = []
        samples[test]['REP'+str(i+1)] = []
    for file in fastqlist :
        # id = basename without extensions; condition = text before first '_'.
        id = file.split('/')[-1].split('.')[0]
        condition = id.split('_')[0]
        for i in range(0,rep):
            if '_REP'+str(i+1)+'_' in id and id[-2:] in ['R1','R2']:
                samples[condition]['REP'+str(i+1)].append(file)
    for exp in samples.keys():
        for namerep in samples[exp] :
            # Strip the trailing '_R1'/'_R2' (3 chars) to get the pair id.
            # NOTE(review): assumes exactly two files per replicate and that
            # one of them matches '_R1.f'/'_R2.f'; otherwise fwd/rev keep
            # their values from the previous pair (or are undefined on the
            # first one) — confirm the naming convention upstream.
            id = samples[exp][namerep][0].split('/')[-1].split('.')[0][:-3]
            print(samples[exp][namerep][0])
            if '_R1.f' in samples[exp][namerep][0] :
                fwd = samples[exp][namerep][0]
                rev = samples[exp][namerep][1]
            elif '_R2.f' in samples[exp][namerep][0] :
                fwd = samples[exp][namerep][1]
                rev = samples[exp][namerep][0]
            run = subprocess.Popen(('trim_galore --paired --fastqc --cores 10 --output_dir '+scriptdir+'/../../results/trim_galore/'+id+' '+fwd+' '+rev),shell=True, stdout=subprocess.PIPE)
            run.communicate()

if __name__ == "__main__":
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    execute_trim(snakemake.params[0],snakemake.params[1],snakemake.params[2],scriptdir)
import argparse
import os
import glob
import re
# (removed: `import yaml` — it was only needed by the commented-out
# config.yaml reading code deleted below; nothing else uses it)


def write_majiq_configfile(scriptdir, genome, control, test):
    """Generate resources/MAJIQ_conf/settings.ini for `majiq build`.

    Scans results/alignment/*/ for *_Aligned.out.sorted.bam files, splits
    the samples into the control and test experiment groups by filename
    prefix (a name matching both prefixes goes to control), and writes the
    [info]/[experiments] ini file MAJIQ expects.

    Parameters:
        scriptdir -- directory of this script; all paths resolve from it
        genome    -- genome assembly name for the `genome=` field
        control   -- control condition name (BAM prefix / group key)
        test      -- test condition name (BAM prefix / group key)
    """
    bampaths = glob.glob(scriptdir + '/../../results/alignment/*/*_Aligned.out.sorted.bam')
    # One directory entry per BAM (trailing slash kept), as MAJIQ accepts
    # a comma-separated bamdirs list.
    bamdirs = ['/'.join(path.split('/')[:-1]) + '/' for path in bampaths]
    # Sample names = BAM basenames without the .bam extension.
    samples = [path.split('/')[-1].split('.bam')[0] for path in bampaths]

    bamcontrol = []
    bamtest = []
    for sample in samples:
        if re.match('^' + control + '_', sample):
            bamcontrol.append(sample)
        elif re.match('^' + test + '_', sample):
            bamtest.append(sample)

    with open(scriptdir + '/../../resources/MAJIQ_conf/settings.ini', 'w') as file:
        file.write('[info]\n')
        file.write('bamdirs=' + ','.join(bamdirs) + '\n')
        file.write('genome=' + genome + '\n')
        file.write('[experiments]\n')
        file.write(control + '=' + ','.join(bamcontrol) + '\n')
        file.write(test + '=' + ','.join(bamtest))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-g", help="Name of genome used for mapping")
    parser.add_argument("-control", help="Name of control condition")
    parser.add_argument("-test", help="Name of test condition")
    args = parser.parse_args()
    scriptdir = os.path.dirname(os.path.realpath(__file__))
    write_majiq_configfile(scriptdir, args.g, args.control, args.test)
Support
- Future updates