Realignment Pipeline for Increased Reproducibility and Portability


About

This realignment pipeline is a reimplementation of the pipeline found at https://github.com/BrenKenna/data_processing. The original pipeline is tailored to run on grid infrastructure.

To increase reproducibility and portability, the pipeline has been reimplemented as a Snakemake workflow.
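
By way of orientation (this is not taken from the repository's documentation), a Snakemake workflow like this one is typically launched from the repository root roughly as follows; the core count is a placeholder:

    # hypothetical invocation; adjust the number of cores to your machine or cluster
    snakemake --cores 8 --printshellcmds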

Code Snippets

Lines 47–57: stage the input CRAM from project storage with pm_copy, run inside the project conda environment:
shell:
    """
    # Relax nounset (set +u) so the conda hook can be sourced without tripping
    # over unset variables, then re-enable it for the copy itself.
    set +u
    eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
    conda activate /cvmfs/softdrive.nl/projectmine_sw/software
    set -u
    # Stage the sample CRAM from project storage to the rule's output path.
    pm_copy /projectmine-nfs/Tape/User/kooyman/UploadFromBroad/{wildcards.SM}.cram {output.cram}
    set +u
    conda deactivate
    set -u
    """

Lines 80–93: index the CRAM, convert it back to paired FASTQ, realign with bwa-mem2 plus samblaster, and coordinate-sort the output:
shell:
    """
    echo "start $(date)"
    # Index the input CRAM.
    samtools index -@ {threads} {input.cram}
    echo "indexed $(date)"
    # Group read pairs with collate and convert them back to paired FASTQ.
    samtools collate -@ {threads} --output-fmt BAM -uOn 128 {input.cram} {wildcards.SM}.tmp | samtools bam2fq -@ {threads} -t -s /dev/null -1 {wildcards.SM}.R1.fq.gz -2 {wildcards.SM}.R2.fq.gz - > /dev/null
    echo "collated $(date)"
    # Realign with bwa-mem2; samblaster adds mate tags (duplicates are marked later by Picard).
    bwa-mem2 mem -K 100000000 -t {threads} -Y {params.ref} -R "@RG\\tID:{wildcards.SM}\\tLB:{wildcards.SM}\\tSM:{wildcards.SM}\\tPL:ILLUMINA" {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz 2>> {log} | samblaster -a --addMateTags | samtools view -h1 --threads {threads} -o {wildcards.SM}.aln.bam
    echo "bwa $(date)"
    rm -f {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz
    # Coordinate-sort and write the BAM plus its index in one pass.
    samtools sort -T /tmp/{wildcards.SM} --write-index -m 7G -l 1 -@ {threads} {wildcards.SM}.aln.bam -o {output.bam}##idx##{output.index}
    echo "sorted $(date)"
    """

Lines 125–141: mark duplicates with Picard and apply base quality score recalibration (BQSR) with GATK3:
shell:
    """
    echo "start $(date)"
    # Mark duplicates with Picard (fast, level-1 compression for the intermediate BAM).
    picard -Djava.io.tmpdir={resources.tmpdir} MarkDuplicates I={input.bam} AS=true O={wildcards.SM}.dedup.bam METRICS_FILE={output.metrics_dedup} QUIET=true COMPRESSION_LEVEL=1 2>> {log.dedup}
    echo "MarkDuplicated $(date)"
    # The extra sort after MarkDuplicates appears unnecessary and is left commented out:
    #samtools sort -m 7G -l 1 -T /tmp/{wildcards.SM} --write-index -@ {threads} {wildcards.SM}.dedup.bam -o {wildcards.SM}.dedup-sorted.bam##idx##{wildcards.SM}.dedup-sorted.bam.bai
    #echo "sorted dedup.bam $(date)"
    #rm {wildcards.SM}.dedup.bam
    samtools index -@ {threads} {wildcards.SM}.dedup.bam
    echo "indexed dedup $(date)"
    # Build the recalibration table from a 10% downsample over the BQSR loci, using the known-sites resources.
    gatk3 -Xmx8G -Djava.io.tmpdir={resources.tmpdir} -T BaseRecalibrator -I {wildcards.SM}.dedup.bam -R {params.ref} -o {wildcards.SM}.recal -nct {threads} --downsample_to_fraction .1 {params.bqsr_loci} -knownSites {params.dbSNP} -knownSites {params.Mills} -knownSites {params.KnownIndels} 2>> {log.bqsr}
    echo "BaseRecalibrator done $(date)"
    # Apply the recalibration and write the final BAM with statically quantized base qualities.
    gatk3 -Xmx8G -Djava.io.tmpdir={resources.tmpdir} -T PrintReads -I {wildcards.SM}.dedup.bam -R {params.ref} -nct {threads} --BQSR {wildcards.SM}.recal -o {output.bam} --globalQScorePrior -1.0 --preserve_qscores_less_than 6 --static_quantized_quals 10 --static_quantized_quals 20 --static_quantized_quals 30 --disable_indel_quals 2>> {log.bqsr}
    echo "printedreads $(date)"
    """

Lines 162–167: convert the recalibrated BAM to CRAM and index it:
shell:
    """
    # Convert the recalibrated BAM to CRAM against the reference and index it.
    samtools view -C -O CRAM -T {params.ref} -@ {threads} {input.bam} -o {output.cram}
    samtools index -@ {threads} {output.cram}
    """

Lines 188–191: per-chromosome variant calling with GATK HaplotypeCaller in GVCF mode:
shell:
    """
    # Call one chromosome (the chr wildcard) in GVCF mode.
    gatk --java-options -Djava.io.tmpdir={resources.tmpdir} HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I {input.cram} -O {output} -L {wildcards.chr} --seconds-between-progress-updates 100 -ERC GVCF --native-pair-hmm-threads {threads}
    """

Lines 210–214: concatenate the per-chromosome gVCFs with bcftools and index the result:
shell:
    """
    # Concatenate the per-chromosome gVCFs and index the merged file.
    bcftools concat -O z -o {output.gvcf} {input}
    tabix -p vcf {output.gvcf}
    """

Lines 237–245: call variants over a target interval list with HaplotypeCaller and recompress the gVCF with bgzip:
shell:
    """
    # Refresh the index timestamp so it is newer than the CRAM.
    touch {input.index}
    gatk --java-options -Djava.io.tmpdir={resources.tmpdir} HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I {input.cram} -O {output.gvcf} -ERC GVCF -L {params.tgt} --seconds-between-progress-updates 100 --native-pair-hmm-threads {threads} &>> {log}
    # Recompress the gVCF with multi-threaded bgzip at compression level 6, then index.
    mv {output.gvcf} {output.gvcf}.tmp
    zcat {output.gvcf}.tmp | bgzip -@ {threads} -l 6 -c > {output.gvcf}
    tabix -fp vcf {output.gvcf}
    """

Lines 258–261: index the input CRAM:
shell:
    """
    samtools index {input.cram}
    """

Lines 17–33: index the CRAM, convert it to paired FASTQ with samtools bamshuf, realign with bwa mem plus samblaster, then sort and write a CRAM:
shell:
    """
    echo "start $(date)"
    samtools index -@ 8 {input.cram}
    echo "index $(date)"
    # Shuffle/group read pairs (bamshuf is the old name of samtools collate) and convert to paired FASTQ.
    samtools bamshuf -@ 8 --reference {params.refold} --output-fmt BAM -uOn 128 {input.cram} {wildcards.SM}.tmp | samtools bam2fq -@ 8 -t -s /dev/null -1 {wildcards.SM}.R1.fq.gz -2 {wildcards.SM}.R2.fq.gz - > /dev/null
    echo "bamshuf $(date)"
    # Realign with bwa mem; samblaster adds mate tags.
    bwa mem -K 100000000 -t 8 -Y {params.ref} -R "@RG\\tID:{wildcards.SM}\\tLB:{wildcards.SM}\\tSM:{wildcards.SM}\\tPL:ILLUMINA" {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz 2>> {log.bwa} | samblaster -a --addMateTags | samtools view -h --threads {threads} -bS > {wildcards.SM}.aln.bam
    echo "bwa $(date)"
    rm -f {wildcards.SM}.R1.fq.gz {wildcards.SM}.R2.fq.gz
    samtools sort -@ 8 {wildcards.SM}.aln.bam > {wildcards.SM}.sorted.bam
    echo "sorted $(date)"
    rm -f {wildcards.SM}.aln.bam
    # Convert the sorted BAM to CRAM.
    samtools view -@ 8 -h -T {params.ref} -C {wildcards.SM}.sorted.bam > {output}
    rm -f {wildcards.SM}.sorted.bam
    echo "cramed $(date)"
    """

Lines 60–92: convert the CRAM to BAM, mark duplicates with Picard, apply BQSR with GATK3, and write the recalibrated CRAM:
shell:
    """
    # Work in the current directory; it is also used as the Java tmpdir below.
    wrk="."
    echo "start $(date)"
    # Convert the input CRAM to BAM and index it.
    samtools view -h -@ 8 -T {params.ref} {input.cram} -b > {wildcards.SM}.bam
    echo "cram2bam input $(date)"
    samtools index -@ 8 {wildcards.SM}.bam
    echo "indexed input $(date)"
    # Mark duplicates with Picard.
    picard -Djava.io.tmpdir=${{wrk}} MarkDuplicates I={wildcards.SM}.bam AS=true O={wildcards.SM}.dedup.bam METRICS_FILE={output.metrics_dedup} QUIET=true COMPRESSION_LEVEL=0 2>> {log.dedup}
    rm {wildcards.SM}.bam
    echo "MarkDuplicated $(date)"
    samtools sort -@ 8 -o {wildcards.SM}.dedup-sorted.bam {wildcards.SM}.dedup.bam
    echo "sorted dedup.bam $(date)"
    rm {wildcards.SM}.dedup.bam
    echo "removed dedup.bam $(date)"
    samtools index -@ 8 {wildcards.SM}.dedup-sorted.bam
    echo "indexed dedup-sorted.bam $(date)"
    # Build the recalibration table and apply BQSR with GATK3.
    gatk3 -Xmx8g -Djava.io.tmpdir=${{wrk}} -T BaseRecalibrator -I {wildcards.SM}.dedup-sorted.bam -R {params.ref} -o {wildcards.SM}.recal -nct {threads} --downsample_to_fraction .1 {params.bqsr_loci} -knownSites {params.dbSNP} -knownSites {params.Mills} -knownSites {params.KnownIndels} 2>> {log.bqsr}
    echo "BaseRecalibrator done $(date)"
    gatk3 -Xmx8g -Djava.io.tmpdir=${{wrk}} -T PrintReads -I {wildcards.SM}.dedup-sorted.bam -R {params.ref} -nct {threads} --BQSR {wildcards.SM}.recal -o {wildcards.SM}.bqsr.bam --globalQScorePrior -1.0 --preserve_qscores_less_than 6 --static_quantized_quals 10 --static_quantized_quals 20 --static_quantized_quals 30 --disable_indel_quals 2>> {log.bqsr}
    echo "printedreads $(date)"
    rm {wildcards.SM}.dedup-sorted.bam
    echo "removed dedup-sorted $(date)"
    # Sort, convert to CRAM and index the final output.
    samtools sort -@ 8 {wildcards.SM}.bqsr.bam -o {wildcards.SM}.final.bam
    echo "sorted $(date)"
    rm {wildcards.SM}.bqsr.bam
    samtools view -h -@ 8 -T {params.ref} -C {wildcards.SM}.final.bam > {output.cram}
    echo "bam2cram output $(date)"
    samtools index -@ 8 {output.cram}
    echo "cramindex output $(date)"
    """

Lines 112–160: run HaplotypeCaller chromosome by chromosome in a shell loop, merge the results into a single gVCF, then bgzip and index it:
shell:
    """
    set -x
    wrk=$(pwd)

    # Call variants on chr1 first; its output seeds the merged gVCF
    # (the header is kept, later chromosomes only contribute records).
    echo -e "\\n\\nCalling chr1\\n" >> {log}
    mkdir -p ${{wrk}}/chr1
    cd ${{wrk}}/chr1
    gatk --java-options -Djava.io.tmpdir=. HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I ${{wrk}}/{input.cram} -O ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz -L chr1 -ERC GVCF --native-pair-hmm-threads {threads} &>> ../{log}

    # Initialise the gVCF and remove the temporary chromosome directory.
    if [ -f ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz ]
    then
        zcat ${{wrk}}/chr1/{wildcards.SM}.chr1.g.vcf.gz > ${{wrk}}/{wildcards.SM}.g.vcf
        cd ${{wrk}}
        rm -fr chr1
    else
        echo -e "\\nError during variant calling, exiting\\n"
        cd ..
        rm -fr ${{wrk}}/{wildcards.SM}.g.vcf
        exit 1
    fi

    # Iteratively run HaplotypeCaller on the remaining chromosomes and append
    # their records (without headers) to the merged gVCF.
    for chrom in chr{{2..22}} chr{{X..Y}}
    do
        echo -e "\\n\\nCalling ${{chrom}}\\n" >> {log}
        mkdir -p ${{wrk}}/${{chrom}}
        cd ${{wrk}}/${{chrom}}
        gatk --java-options -Djava.io.tmpdir=. HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I ${{wrk}}/{input.cram} -O ${{wrk}}/${{chrom}}/{wildcards.SM}.${{chrom}}.g.vcf.gz -L ${{chrom}} -ERC GVCF --native-pair-hmm-threads 2 &>> ../{log}

        # Append records to the merged gVCF and clean up.
        zcat ${{wrk}}/${{chrom}}/{wildcards.SM}.${{chrom}}.g.vcf.gz | grep -v "#" >> ${{wrk}}/{wildcards.SM}.g.vcf
        cd ${{wrk}}
        rm -fr ${{chrom}}
    done

    # Compress and index the merged gVCF.
    bgzip -c ${{wrk}}/{wildcards.SM}.g.vcf > {output.gvcf}
    tabix -p vcf -f {output.gvcf}
    """

Lines 183–187: targeted HaplotypeCaller run over an interval list, followed by indexing of the gVCF:
shell:
    """
    # Call variants over the target interval list and index the resulting gVCF.
    gatk --java-options -Djava.io.tmpdir=. HaplotypeCaller -R {params.ref} --dbsnp {params.dbSNP} -I {input.cram} -O {output.gvcf} -ERC GVCF -L {params.tgt} --native-pair-hmm-threads {threads} &>> {log}
    tabix -fp vcf {output.gvcf}
    """

Maintainers: public
URL: https://github.com/maarten-k/realignment
Name: realignment
Version: 1
Copyright: Public Domain
License: None
