RNA-Seq pipeline
Here we provide the tools to perform paired end or single read RNA-Seq analysis including raw data quality control, differential expression (DE) analysis and functional annotation. As input files you may use either zipped fastq-files (.fastq.gz) or mapped read data (.bam files). In case of paired end reads, corresponding fastq files should be named using .R1.fastq.gz and .R2.fastq.gz suffixes.
Pipeline Workflow
All analysis steps are illustrated in the pipeline flowchart . Specify the desired analysis details for your data in the essential.vars.groovy file (see below) and run the pipeline rnaseq.pipeline.groovy as described here . A markdown file DEreport.Rmd will be generated in the output reports folder after running the pipeline. Subsequently, the DEreport.Rmd file can be converted to a final html report using the knitr R-package.
The pipelines includes
- quality control of rawdata with FastQC and MultiQC
- Read mapping to the reference genome using STAR
- generation of bigWig tracks for visualisation of alignment with deeptools
- Characterization of insert size for paired-end libraries
- Read quantification with featureCounts (Subread)
- Library complexity assessment with dupRadar
- RNA class representation
- Check for strand specificity
- Visualization of gene body coverage
- Illustration of sample relatedness with MDS plots and heatmaps
- Differential Expression Analysis for depicted group comparisons with DESeq2
- Enrichment analysis for DE results with clusterProfiler and ReactomePA
- Additional DE analysis including multimapped reads
Code Snippets
16 17 18 19 20 21 22 23 24 25 26 27 28 | exec """ ${TOOL_ENV} && ${PREAMBLE} && BASEOUTPUT=`basename $output` && CHRSIZES=\${TMP}/\$(basename ${input.prefix}).bam2bw.chrsizes && samtools idxstats ${input} | cut -f1-2 > \${CHRSIZES} && TOTAL_MAPPED=\$( samtools flagstat $input | head -n5 | tail -n1 | cut -f1 -d" ") && SCALE=\$(echo "1000000/\$TOTAL_MAPPED" | bc -l) && genomeCoverageBed -bg -split -scale \${SCALE} -ibam ${input} | sortBed -i - > \${TMP}/\${BASEOUTPUT%.bw}.bedgraph && bedGraphToBigWig \${TMP}/\${BASEOUTPUT%.bw}.bedgraph \${CHRSIZES} \${TMP}/\${BASEOUTPUT} && cp -f \${TMP}/\${BASEOUTPUT} $output ""","bam2bw" |
20 21 22 23 24 25 | exec """ ${TOOL_ENV} && ${PREAMBLE} && bamCoverage $BAMCOVERAGE_FLAGS --bam $input -o ${output}; ""","bamCoverage" |
15 16 17 18 19 20 | exec """ ${TOOL_ENV} && ${PREAMBLE} && samtools index $input ""","BAMindexer" |
50 51 52 53 54 55 56 57 58 | exec """ ${TOOL_ENV} && ${PREAMBLE} && cutadapt $CUTADAPT_FLAGS $CUTADAPT_FLAGS_PAIRED $CUTADAPT_FLAGS_TOOLONG --too-short-output=\${TMP}/${SAMPLENAME_BASE}.cutadapt_discarded.fastq.gz --output=\${TMP}/${SAMPLENAME_BASE}.cutadapt.fastq.gz $input1 $CUTADAPT_INPUT_SECOND 1> ${CUTADAPT_STATSDIR}/${SAMPLENAME_BASE_PRUNED}.cutadapt.log && mv -t ${CUTADAPT_DISCARDED_DIR}/ \${TMP}/${SAMPLENAME_BASE_PRUNED}*cutadapt_discarded*.fastq.gz && mv -t $output.dir \${TMP}/${SAMPLENAME_BASE_PRUNED}*cutadapt.fastq.gz ""","Cutadapt" |
18 19 20 21 22 23 | exec """ ${TOOL_ENV} && ${PREAMBLE} && fastqc --extract $FASTQC_FLAGS -o $output.dir $inputs ""","FastQC" |
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | exec """ ${TOOL_ENV} && ${PREAMBLE} && if [ ! -e "$output.prefix" ]; then mkdir $output.prefix; fi && fastqreference=${FastqScreen_vars.conf}; references=(\${fastqreference//,/ }); for i in "\${!references[@]}"; do reference=(\${references[i]//::/ }); echo -e "DATABASE\t\${reference[0]}\t\${reference[1]}" >> $output.prefix/fastqscreen.conf; done; fastq_screen $FASTQSCREEN_FLAGS --conf $output.prefix/fastqscreen.conf --outdir $output.prefix $inputs; touch $outputs ""","FastqScreen" |
14 15 16 17 18 19 20 21 | exec """ ${TOOL_ENV} && ${PREAMBLE} && chroms=\$(cut -f1 ${FilterChr_vars.file}) && samtools view -@ ${FilterChr_vars.threads} -b $input \$chroms > $output && samtools index $output ""","FilterChr" |
20 21 22 23 24 25 | exec """ ${TOOL_ENV} && ${PREAMBLE} && java ${InsertSize_vars.java_flags} -jar \${PICARD} CollectInsertSizeMetrics $INSERTSIZE_FLAGS INPUT=$input OUTPUT=$output HISTOGRAM_FILE=${output.prefix}_hist.pdf ""","InsertSize" |
14 15 16 17 18 19 | exec """ ${TOOL_ENV} && ${PREAMBLE} && bam dedup --in $input --out $output --log ${input.prefix}_dupmetrics.log --noPhoneHome ""","MarkDups2" |
16 17 18 19 20 21 | exec """ ${TOOL_ENV} && ${PREAMBLE} && multiqc $ESSENTIAL_PROJECT $MultiQC_FLAGS -o $output.dir ""","MULTIQC" |
26 27 28 29 30 31 | exec """ ${TOOL_ENV} && ${PREAMBLE} && Rscript ${PIPELINE_ROOT}/tools/trackhub/Configure_Trackhub.R $TRACKHUB_FLAGS ""","trackhub_config" |
17 18 19 20 21 22 | exec """ ${TOOL_ENV} && ${PREAMBLE} && Rscript ${PIPELINE_ROOT}/tools/trackhub/Make_Trackhub.R $TRACKHUB_FLAGS ""","trackhub" |
34 35 36 37 38 39 40 | exec """ ${TOOL_ENV} && ${PREAMBLE} && echo $inputs > /dev/null && Rscript ${PIPELINE_ROOT}/tools/DE_DESeq2/DE_DESeq2.R $DE_DESeq2_FLAGS ""","DE_DESeq2" |
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | exec """ ${TOOL_ENV} && ${PREAMBLE} && echo $inputs > /dev/null && if [[ ! -e "$INPUT_READS_DIR" ]]; then mkdir "$INPUT_READS_DIR"; fi && for f in $DE_DESeq2_MM_vars.dupradar_outdir/*.tsv; do F=\$(basename \$f) ; tail -n +2 $f | cut -f1,3 | sort -k1,1 > "$INPUT_READS_DIR/\${F%_dupRadar.tsv}.readcounts.tsv" ; done && Rscript ${PIPELINE_ROOT}/tools/DE_DESeq2/DE_DESeq2.R $DE_DESeq2_MM_FLAGS ""","DE_DESeq2_MM" |
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 | exec """ ${TOOL_ENV} && ${PREAMBLE} && base=`basename $input` && bamdupmark=\${TMP}/\${base%.bam}.dupmarked.bam && bam dedup --in $input --out \${bamdupmark} --log ${output.dir}/\${base%.bam}_dupmetrics.log --noPhoneHome && samtools index \${bamdupmark} && if [[ "${dupRadar_vars.paired}" == "yes" ]]; then echo "We are resorting and doing the repair\n" && bamrepair=\${TMP}/\${base%.bam}.dupmarked.repair.bam && repair -i \${bamdupmark} -T ${dupRadar_vars.threads} -o \${bamrepair} && Rscript ${PIPELINE_ROOT}/tools/dupRadar/dupRadar.R bam=\${bamrepair} $DUPRADAR_FLAGS; else Rscript ${PIPELINE_ROOT}/tools/dupRadar/dupRadar.R bam=\${bamdupmark} $DUPRADAR_FLAGS; fi ""","dupRadar" |
13 14 15 16 17 | exec """ ${PREAMBLE} && tail -n +3 $input | awk '{print \$1\"\\t\"\$7}' > $output """ |
22 23 24 25 26 27 28 29 30 31 | exec """ ${TOOL_ENV} && ${PREAMBLE} && if [[ ! -e "$output.dir" ]]; then mkdir -p "$output.dir"; fi && Rscript ${PIPELINE_ROOT}/tools/geneBodyCov/geneBodyCov.R bam=$input $GENEBODYCOV2_FLAGS ""","geneBodyCov2" |
24 25 26 27 28 29 30 31 | exec """ ${TOOL_ENV} && ${PREAMBLE} && touch $output && Rscript ${PIPELINE_ROOT}/tools/GO_Enrichment/GO_Enrichment.R rData=$input $GO_Enrichment_FLAGS && if [ \$? -ne 0 ]; then rm $output; fi; ""","GO_Enrichment" |
19 20 21 22 23 24 | exec """ ${TOOL_ENV} && ${PREAMBLE} && infer_experiment.py -i $input $INFEREXPERIMENT_FLAGS > $output ""","inferexperiment" |
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 | exec """ ${PREAMBLE} && for i in `cut -f2 $PRERMATS_vars.contrasts | sort | uniq`; do if [[ "\${i}" == "contrast" ]]; then echo "Skipping header line of contrasts file\n"; continue; fi; contrast=\${i:1:-1}; groups[0]=`echo \${i:1:-1} | cut -d- -f1`; groups[1]=`echo \${i:1:-1} | cut -d- -f2`; groups=`echo \${groups[0]} \${groups[1]}`; echo \${groups}; awk -v G="\$groups" 'BEGIN {OFS="\t"}{ split(G, g, / /)} { if( \$3 == g[1] || \$3 == g[2] ) { split(\$2,f,"."); \$2=f[1]; print \$0 } }' $PRERMATS_vars.targets > $output.dir/\${contrast}_targets_rMATS.txt; done """, "PRERMATS" |
25 26 27 28 29 30 31 32 | exec """ ${TOOL_ENV} && ${PREAMBLE} && unset DISPLAY; echo $output.prefix; qualimap rnaseq -bam $input -outdir ${output.prefix}_qualimap -outformat html -gtf ${qualimap_vars.genesgtf} -oc $output -p ${qualimap_vars.protocol} ${qualimap_vars.extra} ""","qualimap" |
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | exec """ ${TOOL_ENV} && ${PREAMBLE} && input_var=`basename $input`; suffix=${rMATS_vars.suffix}; groups=\${input_var%\$suffix}; echo "groups after suffix removal" \$groups; sep=${rMATS_vars.sep}; groups=(\${groups//\$sep/ }); echo "groups 0 " \${groups[0]}; add_extensions=""; if [[ "$RUN_CUTADAPT" == "true" ]]; then echo "Will add cutadapt extension"; add_extensions=".cutadapt"; fi; echo \${add_extensions}; mkdir -p $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS && bamgroup0=`awk -v g="\${groups[0]}" -v M="$MAPPED" -v C="\${add_extensions}" 'BEGIN{OFS=""} {if (\$3 == g) print M "/" \$2 C ".bam"}' $input | paste -sd, -` && echo \$bamgroup0 > $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS/\${groups[0]}_samples.txt && bamgroup1=`awk -v g="\${groups[1]}" -v M="$MAPPED" -v C="\${add_extensions}" 'BEGIN{OFS=""} {if (\$3 == g) print M "/" \$2 C ".bam"}' $input | paste -sd, -` && echo \$bamgroup1 > $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS/\${groups[1]}_samples.txt && run_rmats --b1 $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS/\${groups[0]}_samples.txt --b2 $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS/\${groups[1]}_samples.txt $RMATS_FLAGS --od $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS --tmp $output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS_tmp && Rscript ${PIPELINE_ROOT}/tools/maser/createMaserPlots.R $MASER_FLAGS rmats_dir=$output.dir/\${input_var%${rMATS_vars.suffix}}_rMATS scripts_dir=${PIPELINE_ROOT}/tools/maser group1=\${groups[0]} group2=\${groups[1]} && rm -rf $output.dir/*_tmp/ && touch $output; ""","rMATS" |
13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | exec """ ${PREAMBLE} && cp ${PIPELINE_ROOT}/tools/reports/shiny_rnaseq_reporting_tool/DE.shinyrep.helpers.R ${REPORTS} && cp ${PIPELINE_ROOT}/tools/reports/shiny_rnaseq_reporting_tool/styles.css ${REPORTS} && if [ -e "${REPORTS}/DEreport.Rmd" ]; then echo 'DEreport.Rmd already exists. Older copy will be kept and not overwritten'; else cp ${PIPELINE_ROOT}/tools/reports/shiny_rnaseq_reporting_tool/DEreport.Rmd ${REPORTS}; fi && PROJECT=\$(basename ${shinyReports_vars.project}) && sed -i "2,2s/SHINYREPS_PROJECT/\${PROJECT}/" ${REPORTS}/DEreport.Rmd && echo "SHINYREPS_PROJECT=${shinyReports_vars.project}" > $output && echo "SHINYREPS_ORG=${shinyReports_vars.org}" >> $output && echo "SHINYREPS_DB=${shinyReports_vars.db}" >> $output && echo "SHINYREPS_LOG=${shinyReports_vars.log}" >> $output && echo "SHINYREPS_PAIRED=${shinyReports_vars.paired}" >> $output && echo "SHINYREPS_QC=${shinyReports_vars.qc}" >> $output && echo "SHINYREPS_RES=${shinyReports_vars.res}" >> $output && echo "SHINYREPS_PREFIX=${shinyReports_vars.prefix}" >> $output && echo "SHINYREPS_STRANDEDNESS=${shinyReports_vars.strandedness}" >> $output && echo "SHINYREPS_STAR_LOG=${shinyReports_vars.star_log}" >> $output && echo "SHINYREPS_STAR_SUFFIX=${shinyReports_vars.star_suffix}" >> $output && echo "SHINYREPS_STARparms_SUFFIX=${shinyReports_vars.starparms_suffix}" >> $output && echo "SHINYREPS_FASTQC_OUT=${shinyReports_vars.fastqc_out}" >> $output && echo "SHINYREPS_FASTQC_LOG=${shinyReports_vars.fastqc_log}" >> $output && echo "SHINYREPS_FASTQC_SUMMARIZED=${shinyReports_vars.fastqc_summarized}" >> $output && echo "SHINYREPS_FASTQSCREEN_OUT=${shinyReports_vars.fastqscreen_out}" >> $output && echo "SHINYREPS_FASTQSCREEN_LOG=${shinyReports_vars.fastqscreen_log}" >> $output && echo "SHINYREPS_FASTQSCREEN_PERC=${shinyReports_vars.fastqscreen_perc}" >> $output && echo "SHINYREPS_RUN_CUTADAPT=${shinyReports_vars.run_cutadapt}" >> $output && echo "SHINYREPS_CUTADAPT_STATS=${shinyReports_vars.cutadapt_stats}" >> $output && echo "SHINYREPS_BAMINDEX_LOG=${shinyReports_vars.bamindex_log}" >> $output && echo "SHINYREPS_DUPRADAR_LOG=${shinyReports_vars.dupradar_log}" >> $output && echo "SHINYREPS_RNATYPES_LOG=${shinyReports_vars.rnatypes_log}" >> $output && echo "SHINYREPS_RNATYPES=${shinyReports_vars.rnatypes}" >> $output && echo "SHINYREPS_RNATYPES_SUFFIX=${shinyReports_vars.rnatypes_suffix}" >> $output && echo "SHINYREPS_GENEBODYCOV_LOG=${shinyReports_vars.genebodycov_log}" >> $output && echo "SHINYREPS_DE_DESEQ=${shinyReports_vars.de_deseq}" >> $output && echo "SHINYREPS_DE_DESEQ_MM=${shinyReports_vars.de_deseq_mm}" >> $output && echo "SHINYREPS_DE_DESEQ_FDR=${shinyReports_vars.de_deseq_FDR}" >> $output && echo "SHINYREPS_DE_DESEQ_FC=${shinyReports_vars.de_deseq_FC}" >> $output && echo "SHINYREPS_MASER_SCRIPTS=${shinyReports_vars.maser_scripts}" >> $output && echo "SHINYREPS_MASER_FTYPE=${shinyReports_vars.maser_ftype}" >> $output && echo "SHINYREPS_MASER_MINCOV=${shinyReports_vars.maser_mincov}" >> $output && echo "SHINYREPS_MASER_FDR=${shinyReports_vars.maser_fdr}" >> $output && echo "SHINYREPS_MASER_DPSI=${shinyReports_vars.maser_dpsi}" >> $output && echo "SHINYREPS_SUBREAD=${shinyReports_vars.subread}" >> $output && echo "SHINYREPS_SUBREAD_SUFFIX=${shinyReports_vars.subread_suffix}" >> $output && echo "SHINYREPS_SUBREAD_LOG=${shinyReports_vars.subread_log}" >> $output && echo "SHINYREPS_BAM2BW_LOG=${shinyReports_vars.bam2bw_log}" >> $output && echo "SHINYREPS_MARKDUPS_LOG=${shinyReports_vars.markdups_log}" >> $output && echo "SHINYREPS_DESEQ_LOGS=${shinyReports_vars.deseq_logs}" >> $output && echo "SHINYREPS_PLOTS_COLUMN=${shinyReports_vars.plots_column}" >> $output && echo "SHINYREPS_SORT_ALPHA=${shinyReports_vars.sort_alpha}" >> $output && echo "SHINYREPS_INFEREXPERIMENT_LOGS=${shinyReports_vars.inferexperiment_logs}" >> $output && echo "SHINYREPS_QUALIMAP_LOGS=${shinyReports_vars.qualimap_logs}" >> $output && echo "SHINYREPS_INSERTSIZE=${shinyReports_vars.insertsize}" >> $output && echo "SHINYREPS_GO_ENRICHMENT=${shinyReports_vars.go_enrichment}" >> $output && echo "SHINYREPS_TRACKHUB_DONE=${shinyReports_vars.trackhub_done}" >> $output && echo "SHINYREPS_TOOL_VERSIONS=${shinyReports_vars.tool_versions}" >> $output && echo "SHINYREPS_TARGET=${shinyReports_vars.target}" >> $output ""","shinyReports" |
54 55 56 57 58 59 60 61 62 63 64 65 66 67 | exec """ ${TOOL_ENV} && ${PREAMBLE} && if [ -e \${TMP}/$OUTPUTFILE ]; then echo 'removing old STAR tmp folder'; rm -r \${TMP}/$OUTPUTFILE*; fi && STAR $STAR_FLAGS --outTmpDir \${TMP}/$OUTPUTFILE --readFilesIn $inputs | samtools view $SAMTOOLS_VIEW_FLAGS - | samtools sort $SAMTOOLS_SORT_FLAGS -T \${TMP}/${OUTPUTFILE}_sort - > $output1 && mv ${STAR_vars.logdir}/${OUTPUTFILE}SJ.out.tab $output.dir && ln -s ${STAR_vars.logdir}/${OUTPUTFILE}Log.final.out $output.dir ""","STAR" |
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | exec """ ${TOOL_ENV} && ${PREAMBLE} && base=`basename $input` && if [[ "${subread2rnatypes_vars.paired}" == "true" ]]; then echo "We are resorting and doing the repair\n" && repair -i $input -T ${subread2rnatypes_vars.threads} -o \${TMP}/\${base} && featureCounts $RNATYPES_FLAGS -o \${TMP}/\${base} \${TMP}/\${base} 2> ${output.prefix}_rnatypeslog.stderr; else featureCounts $RNATYPES_FLAGS -o \${TMP}/\${base} $input 2> ${output.prefix}_rnatypeslog.stderr; fi && cut -f1,6,7 \${TMP}/\${base} > $output && rm \${TMP}/\${base} ""","subread2rnatypes" |
23 24 25 26 27 28 29 30 31 32 33 34 35 36 | exec """ ${TOOL_ENV} && ${PREAMBLE} && base=`basename $input` && if [[ "${subread_count_vars.paired}" == "true" ]]; then echo "We are resorting and doing the repair\n" && repair -i $input -T ${subread_count_vars.threads} -o \${TMP}/\${base} && featureCounts $SUBREAD_FLAGS -o $output \${TMP}/\${base} 2> ${output.prefix}_subreadlog.stderr; else featureCounts $SUBREAD_FLAGS -o $output $input 2> ${output.prefix}_subreadlog.stderr; fi ""","subread_count" |
19 20 21 22 23 24 25 | exec """ ${TOOL_ENV} && ${PREAMBLE} && Rscript ${PIPELINE_ROOT}/tools/TPMs/TPMs.R -c $input -o $output $TPM_FLAGS ""","tpm" |
Support
- Future updates