FooDMe

FooDMe is a reproducible and scalable snakemake workflow for the analysis of DNA metabarcoding experiments, with a special focus on food and feed samples.

Usage

The documentation for this workflow is hosted on our homepage. If you use this workflow for research, you can cite this repository using the DOI above.

This workflow supports snakemake's standardized usage and is referenced in the snakemake workflow catalog.
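
As a pointer, the standardized usage boils down to deploying the workflow with snakedeploy and running it with conda-managed environments. The sketch below is illustrative only: the repository URL is inferred from the project homepage and the release tag from the version listed on this page, so check the documentation for the authoritative commands.

    # Deploy the workflow into a fresh project directory (URL and tag are assumptions)
    snakedeploy deploy-workflow https://github.com/CVUA-RRW/FooDMe foodme-project --tag v1.6.6
    cd foodme-project

    # Edit config/config.yaml and the sample sheet for your run, then execute with conda environments
    snakemake --cores 8 --use-conda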

Code Snippets

script:
    "../scripts/filter_taxonomy.py"
shell:
    """
    exec 2> {log}

    export BLASTDB={params.taxdb}

    blastdbcmd -db {params.blast_DB} -tax_info -outfmt %T \
    > {output.taxlist}
    """
script:
    "../scripts/make_blast_mask.py"
script:
    "../scripts/apply_blocklist.py"
shell:
    """
    touch {output.mask} 2> {log}
    """
shell:
    """
    touch {output.block} 2> {log}
    """
SnakeMake From line 107 of rules/blast.smk
shell:
    """
    export BLASTDB={params.taxdb}

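    # Restrict the search with a taxid list only when a mask file was actually generated
    # upstream (signalled by its path); otherwise query the whole database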
    if [ {input.mask} = "common/blast_mask.txt" ]
    then
        masking="-taxidlist common/blast_mask.txt"
    else
        masking=""
    fi

    blastn -db {params.blast_DB} \
        -query {input.query} \
        -out {output.report} \
        -task 'megablast' \
        -evalue {params.e_value} \
        -perc_identity {params.perc_identity} \
        -qcov_hsp_perc {params.qcov} $masking \
        -outfmt '6 qseqid sseqid evalue pident bitscore sacc staxid length mismatch gaps stitle' \
        -num_threads {threads} \
    2> {log} 

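    # Prepend a header row naming the custom outfmt columns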
    sed -i '1 i\query\tsubject\tevalue\tidentity\tbitscore\tsubject_acc\tsubject_taxid\talignment_length\tmismatch\tgaps\tsubject_name' {output.report}
    """
shell:
    """
    exec 2> {log}
    if [ -s {input.report} ]; then
      grep -v -f {params.acc_list} {input.report} > {output.report}
    else
      touch {output.report}
    fi
    """
SnakeMake From line 176 of rules/blast.smk
script:
    "../scripts/filter_blast.py"
SnakeMake From line 200 of rules/blast.smk
script:
    "../scripts/min_consensus_filter.py"
SnakeMake From line 218 of rules/blast.smk
shell:
    """
    exec 2> {log}

    if [ -s {input.blast} ]
    then
        # Get list of all OTUs
        OTUs=$(grep "^>" {input.otus} | cut -d";" -f1 | tr -d '>' | sort -u)

        for otu in $OTUs
        do
            size=$(grep -E "^>${{otu}}\>" {input.otus}  | cut -d"=" -f2)
            bhits=$(grep -c -E "^${{otu}};" {input.blast} || true)
            if [ $bhits -eq 0 ]
            then
                # When there is no blast hit
                echo "{wildcards.sample}\t$otu\t$size\t0\t0\t0\t0\t0\t-\t-\t-\t- (1.0)\t../{input.blast}\t../{input.filtered}" >> {output}
            else
                # Otherwise collect and print stats to file
                bit_best=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -rn | head -n1)
                bit_low=$(grep -E "^${{otu}};" {input.blast} | cut -f5 | cut -d. -f1 | sort -n | head -n1)
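                # bit_thr: best bit-score minus the configured difference, i.e. the threshold used when filtering hits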
                bit_thr=$(($bit_best - {params.bit_diff}))
                shits=$(grep -c -E "^${{otu}}\>" {input.filtered})
                cons=$(grep -E "^${{otu}}\>" {input.lca} | cut -d'\t' -f2-5)

                echo "{wildcards.sample}\t$otu\t$size\t$bhits\t$bit_best\t$bit_low\t$bit_thr\t$shits\t$cons\t../{input.blast}\t../{input.filtered}" >> {output}
            fi
        done
        # Sort by size and add header (just to get hits on top)
        sort -k3,3nr -o {output} {output}
        sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}

    else
        echo "{wildcards.sample}\t-\t-\t0\t0\t0\t0\t0\t-\t-\t-\t-\t../{input.blast}\t../{input.filtered}" > {output}
        sed -i '1 i\Sample\tQuery\tCount\tBlast hits\tBest bit-score\tLowest bit-score\tBit-score threshold\tSaved Blast hits\tConsensus\tRank\tTaxid\tDisambiguation\tlink_report\tlink_filtered' {output}
    fi
    """
shell:
    """
    head -n 1 {input.report[0]} > {output.agg}
    for i in {input.report}; do 
      cat ${{i}} | tail -n +2 >> {output.agg}
    done
    """
SnakeMake From line 297 of rules/blast.smk
shell:
    """
    exec 2> {log}

    echo "Sample\tQuery\tUnknown sequences\tUnknown sequences [%]\t(Sub-)Species consensus\t(Sub-)Species consensus [%]\tGenus consensus\tGenus consensus [%]\tFamily consensus\tFamily consensus [%]\tHigher rank consensus\tHigher rank consensus [%]" > {output}

    all=$(grep -c -E "OTU_|ASV_" <(tail -n +2 {input}) || true)
    nohits=$(grep -c "[[:blank:]]-[[:blank:]]" {input} || true)
    spec=$(grep -c "species" {input} || true)
    gen=$(grep -c "genus" {input} || true)
    fam=$(grep -c "family" {input} || true)
    other=$(( $all - $nohits - $spec - $gen - $fam ))

    if [ $all -ne 0 ]
    then
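        # Shell arithmetic is integer-only: scale by 10^3 and let printf's e-3 exponent render two decimals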
        nohits_perc=$(printf %.2f "$((10**3 * (100* $nohits / $all)))e-3")
        spec_perc=$(printf %.2f "$((10**3 * (100* $spec / $all)))e-3")
        gen_perc=$(printf %.2f "$((10**3 * (100* $gen / $all)))e-3")
        fam_perc=$(printf %.2f "$((10**3 * (100* $fam / $all)))e-3")
        other_perc=$(printf %.2f "$((10**3 * (100* $other / $all)))e-3")

        echo "{wildcards.sample}\t$all\t$nohits\t$nohits_perc\t$spec\t$spec_perc\t$gen\t$gen_perc\t$fam\t$fam_perc\t$other\t$other_perc" >> {output}

    else
        echo "{wildcards.sample}\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0\t0" >> {output}
    fi
    """
shell:
    """
    exec 2> {log}
    cat {input.report[0]} | head -n 1 > {output.agg}
    for i in {input.report}; do 
        cat ${{i}} | tail -n +2 >> {output.agg}
    done
    """
SnakeMake From line 364 of rules/blast.smk
script:
    "../scripts/summarize_results.py"
SnakeMake From line 392 of rules/blast.smk
shell:
    """
    exec 2> {log}

    cat {input.report[0]} | head -n 1 > {output.agg}
    for i in {input.report}; do 
        cat ${{i}} | tail -n +2 >> {output.agg}
    done
    """
SnakeMake From line 412 of rules/blast.smk
script:
    "../scripts/krona_table.py"
shell:
    "ktImportText -o {output.graph} {input.table} 2> {log}"
shell:
    """
    exec 2> {log}
    i=0
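    # Build "path,label" arguments for ktImportText, labelling each chart with the first path component (the per-sample directory)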
    for file in {input.report}
    do
        file_list[$i]="${{file}},$(echo ${{file}} | cut -d'/' -f1)"
        ((i+=1))
    done

    ktImportText -o {output.agg} ${{file_list[@]}}
    """
shell:
    """
    exec 2> {log}

    if [[ {params.method} == "otu" ]] 
    then
        echo "Sample\tQ30 rate\tInsert size peak\tRead number\tPseudo-reads\tReads in OTU\tOTU number\tAssigned reads\t(Sub-)Species consensus\tGenus consensus\tHigher rank consensus\tNo match" > {output.report}

        Q30=$(tail -n +2 {input.fastp} | cut -d'\t' -f9)
        size=$(tail -n +2 {input.fastp} | cut -d'\t' -f11)
        reads=$(tail -n +2 {input.merging} | cut -d'\t' -f2)
        pseudo=$(tail -n +2 {input.merging} | cut -d'\t' -f5)
        clustered=$(tail -n +2 {input.clustering} | cut -d'\t' -f10)
        otu=$(tail -n +2 {input.tax} | cut -d'\t' -f2)
        assigned=$(tail -n +2 {input.compo} | awk '$2 != "No match"' | cut -d'\t' -f5 | awk '{{s+=$1}}END{{print s}}')
        spec=$(tail -n +2 {input.tax} | cut -d'\t' -f5)
        gen=$(tail -n +2 {input.tax} | cut -d'\t' -f7)
        high=$(($(tail -n +2 {input.tax} | cut -d'\t' -f9) + $(tail -n +2 {input.tax} | cut -d'\t' -f11)))
        none=$(tail -n +2 {input.tax} | cut -d'\t' -f3)

        echo "{wildcards.sample}\t$Q30\t$size\t$reads\t$pseudo\t$clustered\t$otu\t$assigned\t$spec\t$gen\t$high\t$none" >> {output.report}
    else
        echo "Sample\tQ30 rate\tInsert size peak\tRead number\tPseudo-reads\tReads in ASV\tASV number\tAssigned reads\t(Sub-)Species consensus\tGenus consensus\tHigher rank consensus\tNo match" > {output.report}

        Q30=$(tail -n +2 {input.fastp} | cut -d'\t' -f9)
        size=$(tail -n +2 {input.fastp} | cut -d'\t' -f11)
        reads=$(tail -n +2 {input.clustering} | cut -d'\t' -f2)
        pseudo=$(tail -n +2 {input.clustering} | cut -d'\t' -f6)
        clustered=$(tail -n +2 {input.clustering} | cut -d'\t' -f16)
        otu=$(tail -n +2 {input.tax} | cut -d'\t' -f2)
        assigned=$(tail -n +2 {input.compo} | awk '$2 != "No match"' | cut -d'\t' -f5 | awk '{{s+=$1}}END{{print s}}')
        spec=$(tail -n +2 {input.tax} | cut -d'\t' -f5)
        gen=$(tail -n +2 {input.tax} | cut -d'\t' -f7)
        high=$(($(tail -n +2 {input.tax} | cut -d'\t' -f9) + $(tail -n +2 {input.tax} | cut -d'\t' -f11)))
        none=$(tail -n +2 {input.tax} | cut -d'\t' -f3)

        echo "{wildcards.sample}\t$Q30\t$size\t$reads\t$pseudo\t$clustered\t$otu\t$assigned\t$spec\t$gen\t$high\t$none" >> {output.report}
    fi
    """
shell:
    """
    exec 2> {log}
    cat {input.report[0]} | head -n 1 > {output.agg}
    for i in {input.report}; do 
        cat ${{i}} | tail -n +2 >> {output.agg}
    done
    """
script:
    "../scripts/write_report.Rmd"
script:
    "../scripts/write_report.Rmd"
script:
    "../scripts/conda_collector.py"
shell:
    """
    exec 2> {log}

    echo "Database\tLast modified\tFull path" \
        > {output.report}

    paste \
        <(echo "BLAST") \
        <(date +%F -r {params.blast}.nto) \
        <(echo {params.blast}) \
        >> {output.report}

    paste \
        <(echo "taxdb.bti") \
        <(date +%F -r {params.taxdb}/taxdb.bti) \
        <(echo {params.taxdb}/taxdb.bti) \
        >> {output.report}

    paste \
        <(echo "taxdb.btd") \
        <(date +%F -r {params.taxdb}/taxdb.btd) \
        <(echo {params.taxdb}/taxdb.btd) \
        >> {output.report}

    paste \
        <(echo "taxdump lineages") \
        <(date +%F -r {params.taxdump_lin}) \
        <(echo {params.taxdump_lin}) \
        >> {output.report}

    paste \
        <(echo "taxdump nodes") \
        <(date +%F -r {params.taxdump_nodes}) \
        <(echo {params.taxdump_nodes}) \
        >> {output.report}
    """
script:
    "../scripts/primer_disambiguation.py"
shell:
    """
    seqtk seq -r {input.primers} 1> {output.primers_rc} 2> {log}
    """
shell:
    """
    # Simple case only 5p trimming
    if [[ {params.primer_3p} == False ]]
    then
        cutadapt {input.r1} \
            {input.r2} \
            -o {output.r1} \
            -p {output.r2} \
            -g file:{params.primers} \
            -G file:{params.primers} \
            --untrimmed-output {output.trash_R1_5p} \
            --untrimmed-paired-output {output.trash_R2_5p} \
            --error-rate {params.error_rate} \
            2>&1 > {log}
        touch {output.trash_R1_3p}
        touch {output.trash_R2_3p}

    # If trimming at the 3' end is also necessary, chain two cutadapt passes:
    # the first removes 5' primers, the second (reading the interleaved stream
    # from the pipe) removes the reverse-complemented primers from the 3' end
    else
        cutadapt --interleaved \
            {input.r1} \
            {input.r2} \
            -g file:{params.primers} \
            -G file:{params.primers} \
            --untrimmed-output {output.trash_R1_5p} \
            --untrimmed-paired-output {output.trash_R2_5p} \
            --error-rate {params.error_rate} \
        2>> {log} \
        | cutadapt --interleaved \
            -o {output.r1} \
            -p {output.r2} \
            -a file:{input.primers_rc} \
            -A file:{input.primers_rc} \
            --untrimmed-output {output.trash_R1_3p} \
            --untrimmed-paired-output {output.trash_R2_3p} \
            --error-rate {params.error_rate} \
            - \
        2>&1 >> {log}
    fi
    """
shell:
    """
    exec 2> {log}

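    # Count reads in each gzipped FASTQ: four lines per record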
    before_r1=$(zcat {input.before_r1} | echo $((`wc -l`/4)))
    after_r1=$(zcat {input.after_r1} | echo $((`wc -l`/4)))
    before_r2=$(zcat {input.before_r2} | echo $((`wc -l`/4)))
    after_r2=$(zcat {input.after_r2} | echo $((`wc -l`/4)))

    before=$(( before_r1 + before_r2 ))
    after=$(( after_r1 + after_r2 ))

    if [ $after -ne 0 ] 
    then
        perc_discarded=$( python -c "print(f'{{round(100*(1-${{after}}/${{before}}),2)}}')" )
    else
        perc_discarded=0.00
    fi

    echo "Sample\tTotal raw reads\tTotal reads after primer trim\tNo primer found [%]" > {output.report}
    echo "{wildcards.sample}\t$before\t$after\t$perc_discarded" >> {output.report}
    """
shell:
    """
    fastp -i {input.r1} -I {input.r2} \
        -o {output.r1} -O {output.r2} \
        -h {output.html} -j {output.json}\
        --length_required {params.length_required} \
        --qualified_quality_phred {params.qualified_quality_phred} \
        --cut_by_quality3 \
        --cut_window_size {params.window_size} \
        --cut_mean_quality {params.mean_qual} \
        --disable_adapter_trimming \
        --thread {threads} \
        --report_title 'Sample {wildcards.sample}' \
    > {log} 2>&1
    """
script:
    "../scripts/parse_fastp.py"
shell:
    """
    paste {input.cutadapt} {input.fastp} 1> {output.report} 2> {log}
    """
shell:
    """
    exec 2> {log}

    cat {input.report[0]} | head -n 1 > {output.agg}
    for i in {input.report}; do 
        cat ${{i}} | tail -n +2 >> {output.agg}
    done
    """
import sys


sys.stderr = open(snakemake.log[0], "w")


def main(taxids, blocklist, output):
    with open(taxids, 'r') as fi:
        taxs = set([line.strip() for line in fi.readlines()])

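    # Blocklist lines may carry trailing '#' comments; keep only the taxid part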
    with open(blocklist, 'r') as bl:
        blocks = set([line.split('#')[0].strip() for line in bl.readlines()])

    listout = taxs.difference(blocks)

    with open(output, 'w') as fo:
        for tax in listout:
            fo.write(f"{tax}\n")


if __name__ == '__main__':
    main(snakemake.input["taxids"],
         snakemake.input["blocklist"],
         snakemake.output['mask'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import os
import yaml
import pandas as pd


def extract_package_version(envfile):
    with open(envfile, 'r') as stream:
        env = yaml.safe_load(stream)
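        # Assumes every dependency is pinned as 'package=version' in the environment files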
        for dep in env['dependencies']:
            p, v = dep.split("=")
            yield p, v


def main(report, basedir):
    mypath = os.path.join(basedir, "envs")
    envs = [
        os.path.join(mypath, f) for f in os.listdir(mypath)
        if os.path.isfile(os.path.join(mypath, f)) and f.lower().endswith(('.yaml', '.yml'))
    ]
    df = []
    for ef in envs:
        for p, v in extract_package_version(ef):
            df.append({'Package': p, 'Version': v})
    df = pd.DataFrame(df)
    df.sort_values('Package').to_csv(report, sep="\t", header=True, index=False)


if __name__ == '__main__':
    main(
        report=snakemake.output['report'],
        basedir=snakemake.params['dir']
    )
import sys


sys.stderr = open(snakemake.log[0], "w")


from os import stat
import pandas as pd


def main(report, filtered, bit_diff):
    if stat(report).st_size == 0:
        with open(filtered, "w") as fout:
            fout.write(
                "query\tsubject\tevalue\tidentity\tbitscore\tsubject_acc\t"
                "subject_taxid\talignment_length\tmismatch\tgaps\tsubject_name"
            )
    else:
        df = pd.read_csv(report, sep="\t", header=0)
        if df.empty:
            df.to_csv(filtered, sep="\t", header=True, index=False)
        else:
            sd = dict(tuple(df.groupby("query")))
            dfout = pd.DataFrame()
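            # For each query, keep only the hits whose bit-score is within bit_diff of that query's best hit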
            for key, val in sd.items():
                dfout = pd.concat(
                    [dfout, val[val["bitscore"] >= max(val["bitscore"]) - bit_diff]]
                )
            dfout["query"] = dfout["query"].str.split(";").str[0]
            dfout.to_csv(filtered, sep="\t", header=True, index=False)


if __name__ == '__main__':
    main(snakemake.input['report'],
         snakemake.output['filtered'],
         snakemake.params['bit_diff'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import taxidTools as txd


def main(nodes, lineage, taxid, out):
    tax = txd.Taxonomy.from_taxdump(nodes, lineage)
    tax.prune(taxid)
    tax.write(out)


if __name__ == '__main__':
    main(snakemake.params['nodes'],
         snakemake.params['rankedlineage'],
         snakemake.params['taxid'],
         snakemake.output['tax'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import taxidTools as txd
import pandas as pd


def get_lineage(taxid, tax):
    if taxid == "-":
        return ["Unassigned"]
    elif taxid == "Undetermined":
        return ["Undetermined"]
    else:
        return [node.name for node in tax.getAncestry(taxid)][::-1]
        # inverting list to have the lineage descending for Krona


def main(input, output, taxonomy):
    tax = txd.load(taxonomy)
    df = pd.read_csv(input, sep='\t', header=0)
    with open(output, "w") as out:
        for index, row in df.iterrows():
            out.write(
                "\t".join(
                    [str(row["Count"])] + get_lineage(row['Taxid'], tax)
                ) + "\n")


if __name__ == '__main__':
    main(snakemake.input['compo'],
         snakemake.output['krt'],
         snakemake.input['tax'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import taxidTools as txd


def main(taxid_file, parent, output, taxonomy):

    tax = txd.load(taxonomy)

    with open(taxid_file, "r") as fin:
        db_entries = set(fin.read().splitlines()[1:])

    with open(output, "w") as fout:
        for taxid in db_entries:
            try:
                if tax.isDescendantOf(str(taxid).strip(), str(parent).strip()):
                    fout.write(taxid + "\n")
                else:
                    pass
            except KeyError:
                pass  # Ignoring missing taxids as they are either not in the
                # taxdumps or actively filtered by the user.


if __name__ == '__main__':
    main(snakemake.input['taxlist'],
         snakemake.params["taxid"],
         snakemake.output['mask'],
         snakemake.input['tax'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import taxidTools as txd
from collections import Counter, defaultdict


def parse_blast(blast_file):
    """
    Parse a BLAST report and return a dictionary where keys are query
    sequence names and values are lists of taxids, one per hit.
    The BLAST report must have the following formatting:
        '6 qseqid sseqid evalue pident bitscore sacc
        staxid length mismatch gaps stitle'
    """
    dictout = defaultdict()
    with open(blast_file, 'r') as fi:
        next(fi)  # Skip header
        for line in fi:
            ls = line.split()
            taxids = ls[6].split(";")  # split multi-taxid entries if necessary
            # extend the taxid list for this OTU
            if ls[0] in dictout.keys():
                dictout[ls[0]].extend(taxids)
            # or initiate the list
            else:
                dictout[ls[0]] = taxids

    # Make sure everything is str formatted
    dictout = {k: [str(e) for e in v] for k, v in dictout.items()}

    return dictout


def main(blast_report, output, min_consensus, taxonomy):
    if min_consensus <= 0.5 or min_consensus > 1:
        raise ValueError("'min_consensus' must be in the interval (0.5 , 1]")

    tax = txd.load(taxonomy)
    otu_dict = parse_blast(blast_report)
    with open(output, 'w') as out:
        out.write("queryID\tConsensus\tRank\tTaxid\tDisambiguation\n")

        for queryID, taxid_list in otu_dict.items():
            try:
                consensus = tax.consensus(taxid_list, min_consensus)

            except KeyError:
                # Taxid not present in the Taxdump version
                # used raises a KeyError
                # Filter out missing sequences (verbose)
                taxid_list_new = []
                for taxid in taxid_list:
                    if taxid not in tax.keys():
                        pass  # Most likely the result of active filtering by the user,
                        # so no need to be overly verbose about it
                        # print(f"WARNING: taxid {taxid} missing from Taxonomy "
                        #      f"reference, it will be ignored")
                    else:
                        taxid_list_new.append(taxid)

                # Update list
                taxid_list = taxid_list_new

                # Empty list case:
                if not taxid_list:
                    consensus = "Undetermined"
                else:
                    # Get the consensus with the filtered taxids
                    consensus = tax.consensus(taxid_list, min_consensus)

            finally:
                if consensus != "Undetermined":
                    rank = consensus.rank
                    name = consensus.name
                    taxid = consensus.taxid
                else:
                    taxid = "Undetermined"
                    rank = "Undetermined"
                    name = "Undetermined"

                # (freq, name) tuple to sort
                freqs = [((v/len(taxid_list)), tax.getName(k))
                         for k, v in Counter(taxid_list).items()]
                sorted_freqs = sorted(freqs, reverse=True)

                names = "; ".join([f"{f} ({round(n,2)})"
                                   for (n, f) in sorted_freqs])
                out.write(f"{queryID}\t{name}\t{rank}\t{taxid}\t{names}\n")


if __name__ == '__main__':
    main(snakemake.input['blast'],
         snakemake.output['consensus'],
         snakemake.params["min_consensus"],
         snakemake.input['tax'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import os
import json
import csv


def main(injson, inhtml, outtsv):
    with open(injson, "r") as handle:
        data = json.load(handle)
        link_path = os.path.join("..", inhtml)
        header = (
            "Total bases before quality trim\tTotal reads after quality trim"
            "\tTotal bases after quality trim\tQ20 rate after\tQ30 rate after"
            "\tDuplication rate\tInsert size peak\tlink_to_report"
        )
        datalist = [
            data["summary"]["before_filtering"]["total_bases"],
            data["summary"]["after_filtering"]["total_reads"],
            data["summary"]["after_filtering"]["total_bases"],
            data["summary"]["after_filtering"]["q20_rate"],
            data["summary"]["after_filtering"]["q30_rate"],
            data["duplication"]["rate"],
            data["insert_size"]["peak"],
            link_path,
        ]
    with open(outtsv, "w") as outfile:
        outfile.write(f"{header}\n")
        writer = csv.writer(outfile, delimiter="\t")
        writer.writerow(datalist)


if __name__ == '__main__':
    main(snakemake.input['json'],
         snakemake.input['html'],
         snakemake.output['tsv'])
import sys


sys.stderr = open(snakemake.log[0], "w")


from Bio import SeqIO
from itertools import product


def extend_ambiguous_dna(seq):
    """return list of all possible sequences given an ambiguous DNA input"""
    d = {
        'A': 'A',
        'C': 'C',
        'G': 'G',
        'T': 'T',
        'M': ['A', 'C'],
        'R': ['A', 'G'],
        'W': ['A', 'T'],
        'S': ['C', 'G'],
        'Y': ['C', 'T'],
        'K': ['G', 'T'],
        'V': ['A', 'C', 'G'],
        'H': ['A', 'C', 'T'],
        'D': ['A', 'G', 'T'],
        'B': ['C', 'G', 'T'],
        'N': ['G', 'A', 'T', 'C']
    }
    return list(map("".join, product(*map(d.get, seq))))


def primers_to_fasta(name, seq_list):
    """return fasta string of primers with tracing newline"""
    fas = ""
    for i in range(len(seq_list)):
        fas += f">{name}[{i}]\n{seq_list[i]}\n"
    return fas


def main(fastain, fastaout):
    with open(fastain, "r") as fin, open(fastaout, "w") as fout:
        for record in SeqIO.parse(fin, "fasta"):
            explicit = extend_ambiguous_dna(record.seq)
            fasta = primers_to_fasta(record.id, explicit)
            fout.write(fasta)


if __name__ == '__main__':
    main(snakemake.params['primers'],
         snakemake.output['primers'])
import sys


sys.stderr = open(snakemake.log[0], "w")


import pandas as pd


def concatenate_uniq(entries):
    s = "; ".join(entries.to_list())
    df = pd.DataFrame(
        [e.rsplit(" (", 1) for e in s.split("; ")], columns=["name", "freq"]
        )  # rsplit on the last " (" because taxon names may themselves contain parentheses
    df.loc[:, "freq"] = df["freq"].str.replace(")", "", regex=False).astype(float)
    # Aggregate, normalize, and sort
    tot = df["freq"].sum()
    df = df.groupby("name").apply(lambda x: x.sum() / tot)
    df = df.sort_values(by=["freq"], ascending=False)
    # Format as string
    uniq = df.to_dict()["freq"]
    uniq = [f"{name} ({round(freq, 2)})" for name, freq in uniq.items()]
    return "; ".join(uniq)


def main(compo, report, sample):
    df = pd.read_csv(compo, sep="\t", header=0).fillna(0)

    # Empty input case
    if len(df["Query"]) == 1 and df["Query"].head(1).item() == "-":
        with open(report, "w") as fout:
            fout.write(
                "Sample\tConsensus\tRank\tTaxid\tCount\tDisambiguation\tPercent of total\tPercent of assigned"
            )

    else:
        groups = df.groupby(["Consensus", "Rank", "Taxid"]).agg(
            {"Count": "sum", "Disambiguation": concatenate_uniq}
        )
        groups = groups.sort_values("Count", ascending=False).reset_index()

        # Get percs of total
        groups["perc"] = round(groups["Count"] / groups["Count"].sum() * 100, 2)

        # Get percs of assigned
        assigned, notassigned = (
            groups[groups["Consensus"] != "-"],
            groups[groups["Consensus"] == "-"],
        )
        assigned["perc_ass"] = round(assigned["Count"] / assigned["Count"].sum() * 100, 2)
        notassigned["perc_ass"] = "-"
        groups = pd.concat([assigned, notassigned])

        # Formatting
        groups.insert(0, "Sample", sample)
        groups.rename(columns={"perc": "Percent of total",
                               "perc_ass": "Percent of assigned"},
                      inplace=True)
        groups["Consensus"].replace({"-": "No match"}, inplace=True)
        groups["Taxid"].replace({0: "-"}, inplace=True)
        groups.to_csv(report, sep="\t", index=False)


if __name__ == '__main__':
    main(snakemake.input['compo'],
         snakemake.output['report'],
         snakemake.params['sample_name'])
# logging
log = file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type = "message")

knitr::opts_chunk$set(out.width = '80%',fig.asp= 0.5,fig.align='center',echo=FALSE, warning=FALSE, message=FALSE)
options(markdown.HTML.header = system.file("misc", "datatables.html", package = "knitr"))

library(DT, quietly = T)
library(tidyverse, quietly = T)
library(htmltools, quietly = T)

executor <- Sys.info()["user"]
htmltools::a(
    href="https://cvua-rrw.github.io/FooDMe/",
    htmltools::img(
        src = knitr::image_uri(snakemake@params[['logo']]), 
        alt = 'FooDMe documentation', 
        style = 'position:absolute; top:0; right:0; padding:10px;',
        width=200
    )
)
workdir <- snakemake@params[["workdir"]]

overview <- snakemake@input[["summary"]]
fastp <- snakemake@input[["fastp"]]
qc_filtering <- snakemake@input[["qc_filtering"]]
clustering <- snakemake@input[["clustering"]]
blast_rep <- snakemake@input[["blast_rep"]]
taxonomy <- snakemake@input[["taxonomy"]]
result <- snakemake@input[["result"]]
db <- snakemake@input[["db"]]
soft <- snakemake@input[["soft"]]

OTU_bool <- snakemake@params[["method"]] == "otu" # store TRUE if using the OTU method

# infer run name from workdir
run <- basename(workdir)
#head(tail(strsplit(workdir,"/")[[1]],2),1)

reportAll <- snakemake@params[["sample"]] == "all"

# Number of samples
nsamples <- nrow(read.csv(file = overview, sep = "\t", check.names=FALSE))
data_table <- read.csv(file = overview, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
data_table <- read.csv(file = fastp, sep = "\t", check.names=FALSE)

# Create hyperlinks
data_table$links <- paste0("<a href=", data_table$link_to_report, ">file</a>")
data_table$link_to_report = NULL

datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
cat("## Read filtering statistics\n")
data_table <- read.csv(file = qc_filtering, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
cat("## Clustering statistics\n")
data_table <- read.csv(file = clustering, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
cat("## Denoising statistics\n")
data_table <- read.csv(file = clustering, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
data_table <- read.csv(file = blast_rep, sep = "\t", check.names=FALSE)
#Process links
data_table$blast_report <- paste0("<a href=", data_table$link_report, ">file</a>")
data_table$link_report = NULL
data_table$filtered_report <- paste0("<a href=", data_table$link_filtered, ">file</a>")
data_table$link_filtered = NULL

datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						),
						deferRender = TRUE,
						scroller = TRUE
						))
data_table <- read.csv(file = taxonomy, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						)
						))
data_table <- read.csv(file = result, sep = "\t", check.names=FALSE)
datatable(data_table, filter = 'top', rownames = FALSE, escape = FALSE,
		extensions = list("ColReorder" = NULL, "Buttons" = NULL),
		options = list(	
					dom = 'BRrltpi',
					autoWidth=FALSE,
					scrollX = TRUE,
					lengthMenu = list(c(10, 50, -1), c('10', '50', 'All')),
					ColReorder = TRUE,
					buttons =
					list(
						'copy',
						'print',
						list(
						extend = 'collection',
						buttons = c('csv', 'excel', 'pdf'),
						text = 'Download'
						),
						I('colvis')
						),
						deferRender = TRUE,
						scroller = TRUE
						))
if (snakemake@params[["sample"]] == "all") {
	krona_source <- "krona_chart.html"
} else {
	krona_source <- paste0(snakemake@params[["sample"]], "_krona_chart.html")
}

htmltools::tags$iframe(title = "Krona chart", src = krona_source, width ="100%", height="800px") 
db_table <- read.csv(file = db, sep = "\t", check.names=FALSE)
knitr::kable(db_table)
soft_table <- read.csv(file = soft, sep = "\t", check.names=FALSE)
knitr::kable(soft_table)

Maintainers: public
URL: https://cvua-rrw.github.io/FooDMe
Name: foodme
Version: 1.6.6
Copyright: Public Domain
License: BSD 3-Clause "New" or "Revised" License