CoVigator pipeline: variant detection pipeline for SARS-CoV-2 (and other viruses...)

public · 1yr ago · Version: v0.17.0 · 0 bookmarks

The Covigator pipeline transforms SARS-CoV-2 FASTQ or FASTA files into annotated and normalized VCF files for analysis. It also uses pangolin to classify samples into lineages. The pipeline is built on the Nextflow architecture (Di Tommaso, 2017), and it may be utilized independently of the CoVigator dashboard and knowledge base. Although it is set up by default to analyze SARS-CoV-2, it can also be used to analyze other microbiological organisms if the necessary references are provided. The process produces one or more annotated VCFs with a list of SNVs and indels ready for analysis.

Code Snippets

24
25
26
27
28
29
30
"""
# Prepare the reference genome bundle: copy the FASTA into a local folder and
# build every index required downstream — the bwa-mem2 alignment index, the
# samtools FASTA index (.fai) and the GATK/Picard sequence dictionary (.dict).
mkdir -p reference
cp ${reference} reference/sequences.fa
bwa-mem2 index reference/sequences.fa
samtools faidx reference/sequences.fa
gatk CreateSequenceDictionary --REFERENCE reference/sequences.fa 
"""
52
53
54
55
56
57
58
59
"""
# Build a custom snpEff annotation database from the provided FASTA and GFF.
# snpEff expects a data-dir layout of <dataDir>/<genome>/{sequences.fa,genes.gff}
# plus a config line mapping "<genome>.genome" to a display name.
mkdir -p snpeff/${snpeff_organism}
# NOTE(review): the echo arguments are unquoted; assumes snpeff_organism
# contains no whitespace or shell metacharacters — confirm upstream.
echo ${snpeff_organism}.genome : ${snpeff_organism} > snpeff/snpEff.config
cp ${reference} snpeff/${snpeff_organism}/sequences.fa
cp ${gff} snpeff/${snpeff_organism}/genes.gff
cd snpeff
snpEff build -gff3 -v ${snpeff_organism} -dataDir .
"""
22
23
24
25
26
27
28
29
30
31
"""
# Adapter/quality trimming of paired-end reads with fastp; writes the trimmed
# FASTQ pair plus JSON and HTML QC reports.
# --input_files needs to be forced, otherwise it is inherited from profile in tests
fastp \
--in1 ${fastq1} \
--in2 ${fastq2} \
--out1 ${fastq1.baseName}.trimmed.fq.gz \
--out2 ${fastq2.baseName}.trimmed.fq.gz \
--json ${name}.fastp_stats.json \
--html ${name}.fastp_stats.html
"""
50
51
52
53
54
55
56
57
"""
# Single-end variant of the fastp trimming step; same outputs as the
# paired-end step minus the second read file.
# --input_files needs to be forced, otherwise it is inherited from profile in tests
fastp \
--in1 ${fastq1} \
--out1 ${fastq1.baseName}.trimmed.fq.gz \
--json ${name}.fastp_stats.json \
--html ${name}.fastp_stats.html
"""
19
20
21
22
23
"""
# Align paired-end reads with bwa-mem2, convert SAM to uncompressed BAM
# (-u avoids a wasted compression round-trip) and coordinate-sort in one stream.
bwa-mem2 mem -t ${task.cpus} ${reference} ${fastq1} ${fastq2} | \
samtools view -uS - | \
samtools sort - > ${name}.bam
"""
40
41
42
43
44
"""
# Single-end variant of the bwa-mem2 alignment step; identical streaming
# SAM -> uncompressed BAM -> coordinate sort.
bwa-mem2 mem -t ${task.cpus} ${reference} ${fastq1} | \
samtools view -uS - | \
samtools sort - > ${name}.bam
"""
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
# BAM preprocessing: clean alignments, attach a read group (required by GATK
# tools downstream), remove duplicates and index the final BAM.
mkdir tmp

# CleanSam output is streamed straight into AddOrReplaceReadGroups via
# /dev/stdout -> /dev/stdin; fixed dummy read-group values are sufficient for
# single-sample viral data.
gatk CleanSam \
--java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \
--INPUT ${bam} \
--OUTPUT /dev/stdout | \
gatk AddOrReplaceReadGroups \
--java-options '-Xmx${params.memory} -Djava.io.tmpdir=./tmp' \
--VALIDATION_STRINGENCY SILENT \
--INPUT /dev/stdin \
--OUTPUT ${name}.prepared.bam \
--REFERENCE_SEQUENCE ${reference} \
--RGPU 1 \
--RGID 1 \
--RGSM ${name} \
--RGLB 1 \
--RGPL ILLUMINA

# removes duplicates (sorted from the alignment process)
sambamba markdup \
    -r \
    -t ${task.cpus} \
    --tmpdir=./tmp \
    ${name}.prepared.bam ${name}.preprocessed.bam

# removes intermediate BAM files
rm -f ${name}.prepared.bam

# indexes the output BAM file
sambamba index \
    -t ${task.cpus} \
    ${name}.preprocessed.bam ${name}.preprocessed.bai
"""
78
79
80
81
82
83
84
85
86
87
88
89
90
91
"""
# Soft-clip amplicon primer sequences from the BAM with ivar trim, then
# coordinate-sort and index the trimmed BAM for the downstream callers.
ivar trim \
-i ${bam} \
-b ${primers} \
-p ${bam.baseName}.trimmed

# SortSam is pointed at ./tmp for its temporary files; create it first —
# unlike the other GATK steps in this pipeline, this snippet previously
# relied on the directory already existing.
mkdir -p tmp

gatk SortSam \
--java-options '-Xmx${params.memory}  -Djava.io.tmpdir=./tmp' \
--INPUT ${bam.baseName}.trimmed.bam \
--OUTPUT ${bam.baseName}.trimmed.sorted.bam \
--SORT_ORDER coordinate

gatk BuildBamIndex --INPUT ${bam.baseName}.trimmed.sorted.bam
"""
111
112
113
114
"""
# Coverage QC: per-contig horizontal coverage summary plus per-base depth
# (-d 0 removes the depth cap; -H adds a column header to the TSV).
samtools coverage ${bam} > ${name}.coverage.tsv
samtools depth -s -d 0 -H ${bam} > ${name}.depth.tsv
"""
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""
# Variant calling with bcftools: pileup with BAQ recomputation, no depth cap
# and orphan reads counted, then haploid calling with the multiallelic
# caller, keeping variant sites only. Extra arguments can be injected via
# the params.args_bcftools_mpileup / params.args_bcftools_call options.
bcftools mpileup ${params.args_bcftools_mpileup} \
--redo-BAQ \
--max-depth 0 \
--min-BQ ${params.min_base_quality} \
--min-MQ ${params.min_mapping_quality} \
--count-orphans \
--fasta-ref ${reference} \
--annotate AD ${bam} | \
bcftools call ${params.args_bcftools_call} \
--multiallelic-caller \
--variants-only \
 --ploidy 1 \
 --output-type b - > ${name}.bcftools.bcf
"""
71
72
73
74
75
76
77
78
79
80
81
82
83
"""
# Low-frequency variant calling with LoFreq. Indel qualities are injected on
# the fly through a process substitution (lofreq indelqual --dindel) so that
# --call-indels can be used; the output is bgzipped and converted to BCF.
lofreq call ${params.args_lofreq} \
--min-bq ${params.min_base_quality} \
--min-alt-bq ${params.min_base_quality} \
--min-mq ${params.min_mapping_quality} \
--ref ${reference} \
--call-indels \
<( lofreq indelqual --dindel --ref ${reference} ${bam} ) | bgzip > ${name}.lofreq.vcf.gz

# NOTE: adding the tabix index is a dirty fix to deal with LoFreq VCF missing the chromosome in the header
bcftools index ${name}.lofreq.vcf.gz
bcftools view --output-type b ${name}.lofreq.vcf.gz > ${name}.lofreq.bcf
"""
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
# Variant calling with GATK HaplotypeCaller on a haploid genome; the
# AlleleFraction annotation records per-site allele fractions. The VCF is
# also converted to BCF for downstream processing.
mkdir tmp
gatk HaplotypeCaller ${params.args_gatk} \
--java-options '-Xmx${params.memory} -Djava.io.tmpdir=tmp' \
--input $bam \
--output ${name}.gatk.vcf \
--reference ${reference} \
--ploidy 1 \
--min-base-quality-score ${params.min_base_quality} \
--minimum-mapping-quality ${params.min_mapping_quality} \
--annotation AlleleFraction

bcftools view --output-type b ${name}.gatk.vcf > ${name}.gatk.bcf
"""
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""
# Variant calling with iVar from a samtools pileup (-aa outputs absolutely
# all positions, --max-depth 0 removes the depth cap).
# NOTE(review): the minimum allele frequency is hard-coded to 0.03 here while
# the quality thresholds are parameterised — confirm this is intentional.
samtools mpileup ${params.args_ivar_samtools} \
-aa \
--count-orphans \
--max-depth 0 \
--redo-BAQ \
--min-BQ ${params.min_base_quality} \
--min-MQ ${params.min_mapping_quality} \
--fasta-ref ${reference} \
${bam} | \
ivar variants ${params.args_ivar} \
-p ${name}.ivar \
-q ${params.min_base_quality} \
-t 0.03 \
-r ${reference} \
-g ${gff}
"""
172
173
174
175
176
177
178
179
"""
# Convert the iVar TSV output into a VCF against the reference with the
# project script ivar2vcf.py, then convert to BCF.
ivar2vcf.py \
--fasta ${reference} \
--ivar ${tsv} \
--output-vcf ${name}.ivar.vcf

bcftools view --output-type b ${name}.ivar.vcf > ${name}.ivar.bcf
"""
200
201
202
203
204
205
206
207
208
209
"""
# Call variants from an assembled FASTA by pairwise alignment against the
# reference (project script); the alignment scoring scheme is fully
# parameterised through the match/mismatch/gap score options.
assembly_variant_caller.py \
--fasta ${fasta} \
--reference ${reference} \
--output-vcf ${name}.${caller}.vcf \
--match-score $params.match_score \
--mismatch-score $params.mismatch_score \
--open-gap-score $params.open_gap_score \
--extend-gap-score $params.extend_gap_score
"""
25
26
27
28
29
30
31
32
33
34
35
"""
# VCF normalisation: sort, split multiallelics, left-align and trim indels
# (erroring on reference mismatches), then drop exact duplicates. The
# comment lines between stages are legal — bash skips newlines and comments
# that follow a trailing pipe.
# initial sort of the VCF
bcftools sort ${vcf} | \

# checks reference genome, decompose multiallelics, trim and left align indels
bcftools norm --multiallelics -any --check-ref e --fasta-ref ${reference} \
--old-rec-tag OLD_CLUMPED - | \

# remove duplicates after normalisation
bcftools norm --rm-dup exact -o ${name}.${caller}.normalized.vcf -
"""
57
58
59
60
61
62
63
"""
# Phase variants using the reference FASTA and gene annotations (project
# script phasing.py).
# NOTE(review): presumably merges proximal variants affecting the same
# codon/protein — confirm the exact semantics in phasing.py.
phasing.py \
--fasta ${fasta} \
--gtf ${gtf} \
--input-vcf ${vcf} \
--output-vcf ${name}.${caller}.phased.vcf
"""
39
40
41
42
43
44
45
46
47
48
"""
# Functional annotation with snpEff, restricted to protein-level effects
# (upstream/downstream/intergenic/intronic effects suppressed, HGVS with
# one-letter amino acids); output is bgzip-compressed and tabix-indexed.
# for some reason the snpEff.config file needs to be in the folder where snpeff runs...
cp ${snpeff_config} .

snpEff eff -Xmx${memory} -dataDir ${snpeff_data} \
-noStats -no-downstream -no-upstream -no-intergenic -no-intron -onlyProtein -hgvs1LetterAa -noShiftHgvs \
${snpeff_organism}  ${vcf} | bgzip -c > ${name}.${caller}.vcf.gz

tabix -p vcf ${name}.${caller}.vcf.gz
"""
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
# Classify variants into frequency tiers from the VAFator allele frequency:
# soft-filter tags LOW_FREQUENCY, SUBCLONAL and LOW_QUALITY_CLONAL are added
# to FILTER for records matching each exclusion expression; all records are
# kept in the output.
bgzip -c ${vcf} > ${name}.vcf.gz

tabix -p vcf ${name}.vcf.gz

# annotates low frequency and subclonal variants
bcftools view -Ob ${name}.vcf.gz | \
bcftools filter \
--exclude 'INFO/vafator_af < ${params.low_frequency_variant_threshold}' \
--soft-filter LOW_FREQUENCY - | \
bcftools filter \
--exclude 'INFO/vafator_af >= ${params.low_frequency_variant_threshold} && INFO/vafator_af < ${params.subclonal_variant_threshold}' \
--soft-filter SUBCLONAL \
--output-type v - | \
bcftools filter \
--exclude 'INFO/vafator_af >= ${params.subclonal_variant_threshold} && INFO/vafator_af < ${params.lq_clonal_variant_threshold}' \
--soft-filter LOW_QUALITY_CLONAL \
--output-type v - > ${name}.${caller}.vcf
"""
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
# SARS-CoV-2 specific annotations: three ConsHMM conservation tracks, Pfam
# domain names and descriptions, removal of positions in the unreliable
# genome termini (POS <= 55 or POS >= 29804), and finally the community list
# of problematic sites copied into the INFO field.
bcftools annotate \
--annotations ${params.conservation_sarscov2} \
--header-lines ${params.conservation_sarscov2_header} \
-c CHROM,FROM,TO,CONS_HMM_SARS_COV_2 \
--output-type z ${vcf} | \
bcftools annotate \
--annotations ${params.conservation_sarbecovirus} \
--header-lines ${params.conservation_sarbecovirus_header} \
-c CHROM,FROM,TO,CONS_HMM_SARBECOVIRUS \
--output-type z - | \
bcftools annotate \
--annotations ${params.conservation_vertebrate} \
--header-lines ${params.conservation_vertebrate_header} \
-c CHROM,FROM,TO,CONS_HMM_VERTEBRATE_COV \
--output-type z - | \
bcftools annotate \
--annotations ${params.pfam_names} \
--header-lines ${params.pfam_names_header} \
-c CHROM,FROM,TO,PFAM_NAME \
--output-type z - | \
bcftools annotate \
--annotations ${params.pfam_descriptions} \
--header-lines ${params.pfam_descriptions_header} \
-c CHROM,FROM,TO,PFAM_DESCRIPTION - | \
bcftools filter \
--exclude 'POS <= 55 | POS >= 29804' \
--output-type z - > annotated_sarscov2.vcf.gz

tabix -p vcf annotated_sarscov2.vcf.gz

# Copy the FILTER column of the problematic-sites VCF into INFO/problematic.
bcftools annotate \
--annotations ${params.problematic_sites} \
--columns INFO/problematic:=FILTER annotated_sarscov2.vcf.gz > ${name}.${caller}.annotated_sarscov2.vcf
"""
165
166
167
168
169
170
"""
# Annotate technical metrics from the BAM onto the VCF with VAFator.
# NOTE(review): --bam takes a <sample-name> <bam> pair; "vafator" here is the
# sample-name label, not a file — confirm this matches the expected INFO keys
# (e.g. vafator_af used by the downstream filtering step).
vafator \
--input-vcf ${vcf} \
--output-vcf ${name}.${caller}.vaf.vcf \
--bam vafator ${bam} ${mq_param} ${bq_param}
"""
25
26
27
28
29
30
31
32
33
34
"""
# Lineage assignment with pangolin on the (consensus) FASTA.
# NOTE(review): this step uses params.cpus while the alignment steps use
# task.cpus — confirm which one is intended.
mkdir tmp

#--decompress-model
pangolin \
${fasta} \
--outfile ${name}.${caller}.pangolin.csv \
--tempdir ./tmp \
--threads ${params.cpus}
"""
53
54
55
56
57
58
59
60
61
62
"""
# Build a consensus FASTA by applying the called variants onto the
# reference; only PASS or unfiltered (FILTER=".") records are applied.
bcftools view -O b -o ${name}.bcf ${vcf}
bcftools index ${name}.bcf

# GATK results have all FILTER="."
bcftools consensus --fasta-ref ${reference} \
--include 'FILTER="PASS" | FILTER="."' \
--output ${name}.${caller}.fasta \
${name}.bcf
"""
24
25
26
27
"""
# Compress the final VCF with bgzip and build a tabix index for random access.
bgzip -c ${vcf} > ${name}.${caller}.vcf.gz
tabix -p vcf ${name}.${caller}.vcf.gz
"""
ShowHide 15 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/TRON-Bioinformatics/covigator-ngs-pipeline
Name: covigator
Version: v0.17.0
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...