Snakemake workflow for highly parallel variant calling, designed for ease of use in non-model organisms.

snpArcher is a reproducible workflow optimized for nonmodel organisms and comparisons across datasets, built on the Snakemake workflow management system. It provides a streamlined approach to dataset acquisition, variant calling, quality control, and downstream analysis.

For usage instructions and complete documentation, please visit our docs.
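
Getting started (a minimal sketch, assuming the standard Snakemake project layout the workflow is built on; the exact configuration files, sample sheet format, and recommended execution profiles are described in the docs):

    # clone the workflow and fill in config/config.yaml plus the sample sheet for your data
    git clone https://github.com/harvardinformatics/snpArcher.git
    cd snpArcher

    # dry run to confirm the job DAG resolves
    snakemake --use-conda --cores 8 -n

    # full run: Snakemake builds the conda environments and schedules jobs in parallel
    snakemake --use-conda --cores 8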

Code Snippets

shell:
    "gatk HaplotypeCaller "
    "--java-options \"-Xmx{resources.reduced}m\" "
    "-R {input.ref} "
    "-I {input.bam} "
    "-O {output.gvcf} "
    "-L {input.l} "
    "--emit-ref-confidence GVCF --min-pruning {params.minPrun} --min-dangling-branch-length {params.minDang} &> {log}"
shell:
    """
    bcftools concat -D -a -Ou {input.gvcfs} | bcftools sort -T {resources.tmpdir} -Oz -o {output.gvcf} -
    tabix -p vcf {output.gvcf}
    """
run:
    with open(output.db_mapfile, "w") as f:
        for file_path in input:
            sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "")
            print(sample_name, file_path, sep="\t", file=f)
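
This run block writes a tab-separated sample map, one "sample<TAB>path-to-gVCF" line per input file, which is the format GATK GenomicsDBImport accepts via its --sample-name-map option. For example (hypothetical paths):

    SampleA	results/gvcfs/SampleA.g.vcf.gz
    SampleB	results/gvcfs/SampleB.g.vcf.gz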
shell:
    """
    tar -xf {input.db}
    gatk GenotypeGVCFs \
        --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
        -R {input.ref} \
        --heterozygosity {params.het} \
        --genomicsdb-shared-posixfs-optimizations true \
        -V gendb://{params.db} \
        -O {output.vcf} \
        --tmp-dir {resources.tmpdir} &> {log}
    """
shell:
    "gatk VariantFiltration "
    "-R {input.ref} "
    "-V {input.vcf} "
    "--output {output.vcf} "
    "--filter-name \"RPRS_filter\" "
    "--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
    "--filter-name \"FS_SOR_filter\" "
    "--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') &&  SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') &&  SOR > 10.0)))\" "
    "--filter-name \"MQ_filter\" "
    "--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" "
    "--filter-name \"QUAL_filter\" "
    "--filter-expression \"QUAL < 30.0\" "
    "--create-output-variant-index  "
    "--invalidate-previous-filters true &> {log}"
shell:
    """
    bcftools concat -D -a -Ou {input.vcfs} 2> {log}| bcftools sort -T {resources.tmpdir} -Oz -o {output.vcfFinal} - 2>> {log}
    tabix -p vcf {output.vcfFinal} 2>> {log}
    """
shell:
    "gatk HaplotypeCaller "
    "--java-options \"-Xmx{resources.reduced}m\" "
    "-R {input.ref} "
    "-I {input.bam} "
    "-O {output.gvcf} "
    "--emit-ref-confidence GVCF --min-pruning {params.minPrun} --min-dangling-branch-length {params.minDang} &> {log}"
run:
    with open(output.db_mapfile, "w") as f:
        for file_path in input:
            sample_name = os.path.basename(file_path).replace(".g.vcf.gz", "")
            print(sample_name, file_path, sep="\t", file=f)
run:
    with open(output.intervals, "w") as out:
        with open(input.fai, "r") as f:
            for line in f:
                line = line.strip().split()
                chrom, end = line[0], line[1]
                print(f"{chrom}:1-{end}", file=out)
shell:
    """
    tar -xf {input.db}
    gatk GenotypeGVCFs \
        --java-options '-Xmx{resources.reduced}m -Xms{resources.reduced}m' \
        -R {input.ref} \
        --heterozygosity {params.het} \
        --genomicsdb-shared-posixfs-optimizations true \
        -V gendb://{params.db} \
        -O {output.vcf} \
        --tmp-dir {resources.tmpdir} &> {log}
    """
shell:
    "gatk VariantFiltration "
    "-R {input.ref} "
    "-V {input.vcf} "
    "--output {output.vcf} "
    "--filter-name \"RPRS_filter\" "
    "--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
    "--filter-name \"FS_SOR_filter\" "
    "--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') &&  SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') &&  SOR > 10.0)))\" "
    "--filter-name \"MQ_filter\" "
    "--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" "
    "--filter-name \"QUAL_filter\" "
    "--filter-expression \"QUAL < 30.0\" "
    "--create-output-variant-index  "
    "--invalidate-previous-filters true &> {log}"
shell:
    """
    bcftools sort -Oz -o {output.vcfFinal} {input.vcf} 2>> {log}
    tabix -p vcf {output.vcfFinal} 2>> {log}
    """
shell:
    "mosdepth --d4 -t {threads} {params.prefix} {input.bam} &> {log}"
shell:
    "d4tools merge {input.d4files} {output} &> {log}"
run:
    covStats = collectCovStats(input.covStatFiles)
    with open(output[0], "w") as f:
        print("chrom\tmean_cov\tstdev_cov", file=f)
        for chrom in covStats:
            print(chrom, covStats[chrom]['mean'], covStats[chrom]['stdev'], sep="\t", file=f)
script:
    "../scripts/create_coverage_bed.py"
shell:
    """
    bedtools sort -i {input.cov} | bedtools merge -d {params.merge} -i - > {output.tmp_cov}
    bedtools intersect -a {output.tmp_cov} -b {input.map} | bedtools sort -i - | bedtools merge -i - > {output.callable_sites}
    """
shell:
    "bwa mem -M -t {threads} -R {params.rg} {input.ref} {input.r1} {input.r2} 2> {log} | samtools sort -o {output.bam} - && samtools index {output.bam} {output.bai}"
shell:
    "samtools merge {output.bam} {input} && samtools index {output.bam} > {log}"
shell:
    "sambamba markdup -t {threads} {input.bam} {output.dedupBam} 2> {log}"
shell:
    """
    set +e
    #delete existing prefetch file in case of previous run failure
    rm -rf {wildcards.run}
    ##attempt to get SRA file from NCBI (prefetch) or ENA (wget)
    prefetch --max-size 1T {wildcards.run}
    prefetchExit=$?
    if [[ $prefetchExit -ne 0 ]]
    then
        ffq --ftp {wildcards.run} | grep -Eo '"url": "[^"]*"' | grep -o '"[^"]*"$' | grep "fastq" | xargs curl --remote-name-all --output-dir {params.outdir}
    else
        fasterq-dump {wildcards.run} -O {params.outdir} -e {threads} -t {resources.tmpdir}
        pigz -p {threads} {params.outdir}{wildcards.run}*.fastq
    fi
    rm -rf {wildcards.run}
    """
shell:
    "fastp --in1 {input.r1} --in2 {input.r2} "
    "--out1 {output.r1} --out2 {output.r2} "
    "--thread {threads} "
    "--detect_adapter_for_pe "
    "-j /dev/null -h /dev/null "
    "2> {output.summ} > {log}"
shell:
    "picard ScatterIntervalsByNs REFERENCE={input.ref} OUTPUT={output.intervals} MAX_TO_MERGE={params.minNmer} OUTPUT_TYPE=ACGT &> {log}\n"
run:
    with open(output.intervals, "w") as out:
        with open(input.intervals, "r") as inp:
            for line in inp:
                if not line.startswith("@"):
                    line = line.strip().split("\t")
                    chrom, start, end = line[0], line[1], line[2]
                    print(f"{chrom}:{start}-{end}", file=out)
shell:
    """
    gatk SplitIntervals -L {input.intervals} \
    -O {output.out_dir} -R {input.ref} -scatter {params} \
    -mode INTERVAL_SUBDIVISION \
    --interval-merging-rule OVERLAPPING_ONLY &> {log}
    ls -l {output.out_dir}/*scattered.interval_list > {output.fof}
    """
shell:
    """
    gatk SplitIntervals -L {input.intervals} \
    -O {output.out_dir} -R {input.ref} -scatter {params} \
    -mode BALANCING_WITHOUT_INTERVAL_SUBDIVISION \
    --interval-merging-rule OVERLAPPING_ONLY  &> {log}
    ls -l {output.out_dir}/*scattered.interval_list > {output.fof}
    """
shell:
    """
    awk 'BEGIN{{OFS="\\t";FS="\\t"}} {{ if($4>={params.mappability}) print $1,$2,$3 }}' {input.map} > {output.tmp_map}
    bedtools sort -i {output.tmp_map} | bedtools merge -d {params.merge} -i - > {output.callable_sites}
    """
shell:
    """
    if [ -z "{input.ref}" ]  # check if this is empty
    then
        mkdir -p {params.outdir}
        datasets download genome accession --exclude-gff3 --exclude-protein --exclude-rna --filename {params.dataset} {wildcards.refGenome} \
        && 7z x {params.dataset} -aoa -o{params.outdir} \
        && cat {params.outdir}/ncbi_dataset/data/{wildcards.refGenome}/*.fna > {output.ref}
    else
        cp {input.ref} {output.ref}
    fi
    """
shell:
    """
    bwa index {input.ref} 2> {log}
    samtools faidx {input.ref} --output {output.fai} >> {log}
    samtools dict {input.ref} -o {output.dictf} >> {log} 2>&1
    """
shell:
    """
    export MALLOC_CONF=lg_dirty_mult:-1
    export SENTIEON_LICENSE={params.lic}
    sentieon bwa mem -M -R {params.rg} -t {threads} -K 10000000 {input.ref} {input.r1} {input.r2} | sentieon util sort --bam_compression 1 -r {input.ref} -o {output.bam} -t {threads} --sam2bam -i -
    samtools index {output.bam} {output.bai}
    """
shell:
    "samtools merge {output.bam} {input} && samtools index {output.bam}"
shell:
    """
    export SENTIEON_LICENSE={params.lic}
    sentieon driver -t {threads} -i {input.bam} --algo LocusCollector --fun score_info {output.score}
    sentieon driver -t {threads} -i {input.bam} --algo Dedup --score_info {output.score} --metrics {output.metrics} --bam_compression 1 {output.dedupBam}
    """
shell:
    """
    export SENTIEON_LICENSE={params.lic}
    sentieon driver -r {input.ref} -t {threads} -i {input.bam} --algo Haplotyper --genotype_model multinomial --emit_mode gvcf --emit_conf 30 --call_conf 30 {output.gvcf} 2> {log}
    """
shell:
    """
    export SENTIEON_LICENSE={params.lic}
    sentieon driver -r {input.ref} -t {threads} --algo GVCFtyper --emit_mode VARIANT {output.vcf} {params.glist} 2> {log}
    """
shell:
    "gatk VariantFiltration "
    "-R {input.ref} "
    "-V {input.vcf} "
    "--output {output.vcf} "
    "--filter-name \"RPRS_filter\" "
    "--filter-expression \"(vc.isSNP() && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -8.0)) || ((vc.isIndel() || vc.isMixed()) && (vc.hasAttribute('ReadPosRankSum') && ReadPosRankSum < -20.0)) || (vc.hasAttribute('QD') && QD < 2.0)\" "
    "--filter-name \"FS_SOR_filter\" "
    "--filter-expression \"(vc.isSNP() && ((vc.hasAttribute('FS') && FS > 60.0) || (vc.hasAttribute('SOR') &&  SOR > 3.0))) || ((vc.isIndel() || vc.isMixed()) && ((vc.hasAttribute('FS') && FS > 200.0) || (vc.hasAttribute('SOR') &&  SOR > 10.0)))\" "
    "--filter-name \"MQ_filter\" "
    "--filter-expression \"vc.isSNP() && ((vc.hasAttribute('MQ') && MQ < 40.0) || (vc.hasAttribute('MQRankSum') && MQRankSum < -12.5))\" "
    "--filter-name \"QUAL_filter\" "
    "--filter-expression \"QUAL < 30.0\" "
    "--invalidate-previous-filters true &> {log}"
shell:
    """
    samtools coverage --output {output.cov} {input.bam}
    samtools flagstat -O tsv {input.bam} > {output.alnSum}
    """
shell:
    """
    export SENTIEON_LICENSE={params.lic}
    sentieon driver -r {input.ref} \
    -t {threads} -i {input.bam} \
    --algo MeanQualityByCycle {output.mq} \
    --algo QualDistribution {output.qd} \
    --algo GCBias --summary {output.gc_summary} {output.gc} \
    --algo InsertSizeMetricAlgo {output.insert_file}
    """
shell:
    "cat {input} > {output}"
run:
    if not config['sentieon']:
        FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles)
        aln_metrics = collectAlnSumMets(input.alnSumMetsFiles)
        SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles)
        printBamSumStats(SeqDepths, CoveredBases, aln_metrics, FractionReadsPassFilter, NumReadsPassFilter, output[0])
    else:
        FractionReadsPassFilter, NumReadsPassFilter = collectFastpOutput(input.fastpFiles)
        aln_metrics = collectAlnSumMets(input.alnSumMetsFiles)
        SeqDepths, CoveredBases = collectCoverageMetrics(input.coverageFiles)
        median_inserts, median_insert_std = collect_inserts(input.insert_files)
from pyd4 import D4File, D4Builder
from snakemake.exceptions import WorkflowError  # used by the config checks below
import math

# per-chromosome coverage thresholds (low/high), filled in below
cov_thresh = {}
stdv_scale = snakemake.params["cov_threshold_stdev"]
rel_scale = snakemake.params["cov_threshold_rel"]
mean_lower = snakemake.params["cov_threshold_lower"]
mean_upper = snakemake.params["cov_threshold_upper"]

#check that correct settings are set

if stdv_scale:
    if rel_scale:
        raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_rel are set, please choose one and make sure the other variable is empty in the config file.")
    elif mean_lower or mean_upper:
        raise WorkflowError(f"Both cov_threshold_stdev and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.")
elif rel_scale:
    if mean_lower or mean_upper:
         raise WorkflowError(f"Both cov_threshold_rel and cov_threshold_lower/cov_threshold_upper are set, please choose one and make sure the other variable is empty in the config file.")
elif mean_lower:
    if not mean_upper:
        mean_upper = 50000
elif mean_upper:
    if not mean_lower:
        mean_lower = 1
else:
    raise WorkflowError(f"Use coverage filter is True, but you did not specify coverage filtering options in the config. Please check.")

with open(snakemake.input["stats"]) as stats:
    for line in stats:
        if "mean" in line:
            continue

        fields = line.split()
        # column 0 is chrom, column 1 is mean coverage
        mean = float(fields[1])
        # approximate the stdev as sqrt(mean), i.e. treat coverage as Poisson-distributed
        stdev = math.sqrt(mean)
        if stdv_scale:
            cov_thresh[fields[0]] = {
                'low' : mean - (stdev * float(stdv_scale)),
                'high' : mean + (stdev * float(stdv_scale))
                }
        elif rel_scale: 
            cov_thresh[fields[0]] = {
                'low' : mean / float(rel_scale),
                'high' : mean * float(rel_scale)
                }
        else:
            cov_thresh[fields[0]] = {
                'low' : float(mean_lower),
                'high' : float(mean_upper)
                }

# open the d4 coverage file and all of its tracks
covfile = D4File(snakemake.input["d4"])
covmat = covfile.open_all_tracks()

with open(snakemake.output["covbed"], mode='w') as covbed:
    good_interval = False
    for chrom in covfile.chroms():

        try: 
            thresh_high = cov_thresh[chrom[0]]['high']
        except KeyError:
            thresh_high = cov_thresh['total']['high']

        try:
            thresh_low = cov_thresh[chrom[0]]['low']
        except KeyError:
            thresh_low = cov_thresh['total']['low']

        for values in covmat.enumerate_values(chrom[0],0,chrom[1]):
            covs=values[2]
            pos=values[1]
            chr=values[0]
            #get mean coverage for window
            res1=math.fsum(covs)/len(covs)

            if res1 <= thresh_high and res1 >= thresh_low and good_interval == False:
                # we are starting a new interval, print chr and pos
                print(chr, pos, file=covbed, sep="\t", end="")
                # set good interval to True
                good_interval = True
            elif (res1 > thresh_high or res1 < thresh_low) and good_interval:
                # we are ending a good interval
                print("\t", pos, file=covbed, sep="")
                good_interval = False
            else:
                # we are either in a good interval, or in a bad interval, so just keep going
                continue
        # if we are still in a good interval at this stage, it runs to the end of the chromosome
        if good_interval:
            endpos = chrom[1]+1
            print("\t", endpos, file=covbed, sep="")
            good_interval = False

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/harvardinformatics/snpArcher
Name: snparcher
Version: v0.1-alpha.1
Copyright: Public Domain
License: MIT License