snakemake workflow for bulk RNA-seq workflow using STAR-edgeR
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
Usage
NOTE this workflow is optimized for the HPC @ Van Andel Institute.
Step 1: Configure the workflow
-
Move your sequencing reads to
raw_data/
-
Modify the config and samplesheet:
-
config/samplesheet/units.tsv; To make a template based in the files in
raw_data/
, run./make_units_template.sh
.-
sample - ID of biological sample; Must be unique.
-
group - Experimental group
-
fq1 - name of read1 fastq
-
fq2 * - name of read2 fastq
-
-
config/config.yaml
-
Step 2: Test and run the workflow
Test your configuration by performing a dry-run via
snakemake -npr
Execute from within your project directory as a SLURM job.
sbatch bin/run_snake.sh
Code Snippets
25 26 27 28 | shell: """ {params.cat_or_symlink} {output} """ |
44 45 46 47 | shell: """ fastq_screen --threads {threads} --outdir results/fastq_screen/ {input} """ |
68 69 70 71 | shell: """ fastqc --outdir {params.outdir} {input} """ |
87 88 89 90 | shell: """ seqtk sample -s 100 {input} {params.num_subsamp} | gzip -c > {output} """ |
124 125 126 127 128 129 130 131 132 133 134 135 136 | shell: """ sortmerna --threads {threads} {params.fastqs} --workdir {output} \ --idx-dir {params.idx_dir} \ --ref {params.rfam5s} \ --ref {params.rfam5_8s} \ --ref {params.silva_arc_16s} \ --ref {params.silva_arc_23s} \ --ref {params.silva_bac_16s} \ --ref {params.silva_bac_23s} \ --ref {params.silva_euk_18s} \ --ref {params.silva_euk_28s} """ |
180 181 182 183 184 185 186 187 | shell: """ multiqc -f {params} \ -o results/multiqc \ --ignore '*._STARpass1/*' \ -n multiqc_report.html \ --cl-config 'max_table_rows: 999999' """ |
26 27 28 29 | shell: """ ln -sr {input} {output} """ |
65 66 67 68 69 70 71 72 73 74 | shell: """ trim_galore \ --paired \ {input} \ --output_dir {params.outdir} \ --cores {threads} \ -q 20 \ --fastqc """ |
95 96 97 98 99 100 101 102 103 | shell: """ trim_galore \ {input} \ --output_dir {params.outdir} \ --cores {threads} \ -q 20 \ --fastqc """ |
183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | shell: """ STAR \ --runThreadN {threads} \ --genomeDir {params.index} \ --readFilesIn {params.in_fastqs} \ --outSAMattrRGline {params.read_groups} \ --twopassMode Basic \ --readFilesCommand zcat \ --outSAMtype BAM Unsorted \ --outFileNamePrefix {params.outprefix} \ --quantMode GeneCounts \ --outStd Log samtools sort -@ {threads} -o {output.bam} {output.unsorted_bam} samtools index {output.bam} """ |
219 220 221 222 223 224 225 226 227 228 | shell: """ salmon quant \ -p {threads} \ -l A \ -i {input.index} \ {params.reads} \ --validateMappings \ -o {params.outdir} """ |
249 250 251 252 | shell: """ Rscript --vanilla workflow/scripts/make_sce.R {params.gtf} {params.orgdb} {output.se} {output.sce} {output.sizeFactors} """ |
18 19 20 21 22 23 24 25 26 27 28 29 30 31 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ MarkDuplicatesSpark \ --spark-master local[{threads}] \ -I {input} \ -O {output.bam} \ -M {output.metrics} \ --conf spark.executor.cores={threads} \ --conf spark.local.dir=./tmp \ --conf spark.driver.memory=6g \ --conf spark.executor.memory=5g """ |
48 49 50 51 52 53 54 55 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ SplitNCigarReads \ -R {params.ref_fasta} \ -I {input} \ -O {output} """ |
75 76 77 78 79 80 81 82 83 84 85 86 87 88 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ HaplotypeCaller \ -R {params.ref_fasta} \ -I {input.bam} \ -O {output} \ -ERC GVCF \ -dont-use-soft-clipped-bases \ --native-pair-hmm-threads {threads} \ --standard-min-confidence-threshold-for-calling 20 \ {params.dbsnp} \ {params.contigs} """ |
108 109 110 111 112 113 114 115 116 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ GenomicsDBImport \ {params.sample_gvcfs} \ --reader-threads {threads} \ --genomicsdb-workspace-path {output.genomicsdb} \ {params.contigs} """ |
134 135 136 137 138 139 140 141 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ GenotypeGVCFs \ -R {params.ref_fasta} \ -V gendb://{params.genomicsdb} \ -O {output.vcf} """ |
161 162 163 164 165 166 167 168 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ SortVcf \ -I {input.vcf} \ -O {output.sorted_vcf} \ -SD {params.dictionary} """ |
193 194 195 196 197 198 199 200 201 202 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ MergeVcfs \ {params.in_vcfs} \ --SEQUENCE_DICTIONARY {params.dictionary} \ --OUTPUT {output.raw} vt peek -r {params.ref_fasta} {output.raw} 2> {output.vt_peek_raw} """ |
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ SelectVariants \ -R {params.ref_fasta} \ -V {input} \ --select-type-to-include {wildcards.var_type} \ -O {output.raw} echo "SelectVariants 1 done." echo "SelectVariants 1 done." 1>&2 gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ VariantFiltration \ --R {params.ref_fasta} \ --V {output.raw} \ {params.filt_params} \ -O {output.filt} echo "VariantFiltration done." echo "VariantFiltration done." 1>&2 gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ SelectVariants \ -R {params.ref_fasta} \ -V {output.filt} \ --exclude-filtered \ -O {output.pass_only} echo "SelectVariants 2 done." echo "SelectVariants 2 done." 1>&2 vt peek -r {params.ref_fasta} {output.pass_only} 2> {output.vt_peek_pass} """ |
302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 | shell: """ gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ BaseRecalibrator \ -R {params.ref_fasta} \ -I {input.bam} \ --known-sites {input.known_snps_vcf} \ --known-sites {input.known_indels_vcf} \ -O {output.recal_table} echo "BaseRecalibrator done." echo "BaseRecalibrator done." 1>&2 gatk --java-options "-Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp" \ ApplyBQSR \ -R {params.ref_fasta} \ -I {input.bam} \ -bqsr {output.recal_table} \ -O {output.recal_bam} echo "ApplyBQSR done." echo "ApplyBQSR done." 1>&2 """ |
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | shell: """ java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $SNPEFF/snpEff.jar eff \ -v \ -canon \ -onlyProtein \ -stats {output.html_canon} \ {params.db_id} \ {input} | \ bgzip > {output.vcf_canon} tabix {output.vcf_canon} java -Xms8g -Xmx{resources.mem_gb}g -Djava.io.tmpdir=./tmp -jar $SNPEFF/snpEff.jar eff \ -v \ -onlyProtein \ -stats {output.html} \ {params.db_id} \ {input} | \ bgzip > {output.vcf} tabix {output.vcf} """ |
394 395 396 397 398 399 400 401 402 | shell: """ ln -sr {input.vcf} {output.symlink_vcf} ln -sr {params.rmd} {output.symlink_rmd} cd {params.wd} Rscript --vanilla -e "rmarkdown::render('snprelate.Rmd', params = list(in_vcf = '{params.in_vcf}', outdir = '{params.outdir}'))" """ |
32 33 34 35 36 37 | shell: """ bamCoverage -b {input.bam} -o {output.bw} --minMappingQuality 30 \ --normalizeUsing "None" -p {threads} --binSize 10 \ --scaleFactor {params.scale_factor} {params.strand} """ |
54 55 56 57 | shell: """ bigwigAverage -b {input} --binSize 10 -p {threads} -o {output.bw} -of "bigwig" """ |
Support
- Future updates