Developing Read Abundance rule


(PHage IDEntification PIPEline Repository)

This repository contains a Snakemake workflow that uses state-of-the-art tools to identify free phages and prophages in metagenomic sequences, from both viral-enriched and whole-community samples.
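
Each snippet below is the shell body of a Snakemake rule. For orientation, here is a minimal sketch of the scaffolding these bodies plug into (the rule name and file paths are hypothetical, not taken from the repository):

    rule rename_reads:                          # hypothetical rule name
        input:
            R1="reads/{sample}_1.fastq.gz",     # hypothetical paths
            R2="reads/{sample}_2.fastq.gz",
        output:
            R1="renamed/{sample}_R1.fastq.gz",
            R2="renamed/{sample}_R2.fastq.gz",
        shell:
            """
            # symlink input paths to renamed files
            ln -s {input.R1} {output.R1}
            ln -s {input.R2} {output.R2}
            """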

Code Snippets

Lines 60-65:
shell:
    """
    # symlink input paths to renamed files
    ln -s {input.R1} {output.R1}
    ln -s {input.R2} {output.R2}
    """
Lines 95-100:
shell:
    """
    # symlink input paths to renamed files
    ln -s {input.R1} {output.R1}
    ln -s {input.R2} {output.R2}
    """
Lines 118-122:
shell:
    """
    gunzip -c {input.R1} > {output.R1}
    gunzip -c {input.R2} > {output.R2}
    """
Lines 147-158:
shell:
    """
    # run clumpify
    clumpify.sh \
    in={input.R1} \
    in2={input.R2} \
    out={output.R1} \
    out2={output.R2} \
    {params.extra_args} > {output.log} 2>&1

    cp {output.log} {log}
    """
Lines 177-181:
shell:
    """
    # download human genome reference to desired directory
    kneaddata_database --download human_genome bowtie2 {params.kneaddata_db}
    """
Lines 217-228:
shell:
    """
    # run kneaddata to quality filter and remove host reads
    kneaddata --input {input.R1} --input {input.R2} \
    --output {params.output_dir} \
    --output-prefix {params.prefix} \
    --reference-db {params.human_db} \
    --threads {threads} \
    {params.extra_args}

    cp {output.log} {log}
    """
Lines 262-268:
shell:
    """
    # generate read counts from kneaddata log files
    kneaddata_read_count_table \
    --input {params.log_dir} \
    --output {output}
    """
Lines 68-75:
shell:
    """
    # combine reads for coassembly
    cat {input.R1} > {output.R1}
    cat {input.R2} > {output.R2}
    cat {input.R1S} > {output.R1S}
    cat {input.R2S} > {output.R2S}
    """
Lines 103-118:
shell:
    """
    # assemble reads using metaspades
    # caution: some SPAdes versions honor only the last --pe1-s given,
    # so concatenating the singleton files into one first is the safer pattern
    spades.py \
    --meta \
    --pe1-1 {input.R1} \
    --pe1-2 {input.R2} \
    --pe1-s {input.R1S} \
    --pe1-s {input.R2S} \
    -o {params.output_dir} \
    --threads {threads} \
    {params.extra_args}

    # copy spades.log to log file
    cp {params.output_dir}/spades.log {log}
    """
Lines 142-156:
shell:
    """
    # assembly analysis using quast
    metaquast.py \
    {input} \
    -o {params.output_dir} \
    --threads {threads} \
    --min-contig {params.min_len} \
    --contig-thresholds 0,1000,5000,10000,{params.min_len} \
    --labels {params.labels} \
    {params.extra_args}

    # copy quast.log to log file
    cp {params.output_dir}/quast.log {log}
    """
Lines 204-208:
shell:
    """
    # combine quast reports for all assemblies, only keeping the header from one file
    awk 'FNR>1 || NR==1' {input} > {output}
    """
Lines 54-57:
shell:
    """
    cat {input} > {output}
    """
Lines 82-95:
shell:
    """
    # make a blast db from phage contigs
    makeblastdb -in {input} -out {params.blastdb} -dbtype nucl

    # all against all blast
    blastn -query {input} -db {params.blastdb} -out {params.blast_tsv} -num_threads {threads} -outfmt '6 std qlen slen' -max_target_seqs 25000 -perc_identity 90

    # calculate ani and af from blast results
    python {params.blastani_script} -i {params.blast_tsv} -o {params.ani_tsv}

    # cluster phage genomes based on 95% ani and 85% af
    python {params.cluster_script} --fna {input} --ani {params.ani_tsv} --out {output} --min_ani {params.min_ani} --min_qcov {params.min_qcov} --min_tcov {params.min_tcov}
    """
Lines 133-146:
shell:
    """
    # make a blast db from phage contigs
    makeblastdb -in {input} -out {params.blastdb} -dbtype nucl

    # all against all blast
    blastn -query {input} -db {params.blastdb} -out {params.blast_tsv} -num_threads {threads} -outfmt '6 std qlen slen' -max_target_seqs 25000 -perc_identity 90

    # calculate ani and af from blast results
    python {params.blastani_script} -i {params.blast_tsv} -o {params.ani_tsv}

    # cluster phage genomes based on 95% ani and 85% af
    python {params.cluster_script} --fna {input} --ani {params.ani_tsv} --out {output} --min_ani {params.min_ani} --min_qcov {params.min_qcov} --min_tcov {params.min_tcov}
    """
Lines 171-188:
shell:
    """
    # make a bowtie2 db from virusdb
    bowtie2-build {input.virusdb} {params.db} --threads {threads}

    # align reads to bowtie2 database
    bowtie2 \
    --threads {threads} \
    -x {params.db} \
    -1 {input.R1} \
    -2 {input.R2} \
    -U {input.R1S},{input.R2S} \
    -S {params.sam}

    # convert sam to bam
    samtools view -S -b {params.sam} > {output}
    rm -rf {params.sam}
    """
Lines 199-205:
shell:
    """
    # build metapop
    pip install metapop

    touch {output}
    """
Lines 228-236:
shell:
    """
    # run metapop to identify viruses present in samples
    metapop --input_samples {params.bam_dir} \
    --reference {input.virusdb} \
    --genes /home/carsonjm/resources/virusdb/proteins/all_genomes_genes.fasta \
    --output {params.out_dir} \
    --no_micro --no_macro
    """
Lines 283-299:
shell:
    """
    # concatenate results
    cat {input.viruses} {input.virusdb} > {output.combined}

    # make a blast db from phage contigs
    makeblastdb -in {output.combined} -out {params.blastdb} -dbtype nucl

    # all against all blast
    blastn -query {output.combined} -db {params.blastdb} -out {params.blast_tsv} -num_threads {threads} -outfmt '6 std qlen slen' -max_target_seqs 25000 -perc_identity 90

    # calculate ani and af from blast results
    python {params.blastani_script} -i {params.blast_tsv} -o {params.ani_tsv}

    # cluster phage genomes based on 95% ani and 85% af
    python {params.cluster_script} --fna {output.combined} --ani {params.ani_tsv} --out {output.clusters} --min_ani {params.min_ani} --min_qcov {params.min_qcov} --min_tcov {params.min_tcov}
    """
Lines 47-54:
shell:
    """
    # make blastdb
    makeblastdb \
    -in {input} \
    -out {params.spacers_blastdb} \
    -dbtype nucl
    """
Lines 70-81:
shell:
    """
    # blast viruses against uhgg spacers
    blastn \
    -query {input.viruses} \
    -db {params.spacers_blastdb} \
    -out {output} \
    -outfmt '6 std qlen slen' \
    -dust no \
    -word_size 18 \
    -num_threads {threads}
    """
Lines 114-127:
shell:
    """
    # git clone phist
    rm -rf {params.phist_dir}
    git clone --recurse-submodules https://github.com/refresh-bio/PHIST {params.phist_dir}

    # build phist
    cd {params.phist_dir}
    make
    mkdir ./out

    # test phist
    python3 phist.py ./example/virus ./example/host ./out/common_kmers.csv ./out/predictions.csv
    """
Lines 158-163:
shell:
    """
    # run phist using uhgg
    python3 {params.phist_script} {params.virus_dir} {params.bacteria_db_dir} {output.table} {output.predictions} \
    --t {threads}
    """
Lines 200-215:
shell:
    """
    # download rafah files
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/README.md
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/RaFAH_v0.3_hmm_models.tgz
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/RaFAH_v0.3_Ranger_Model.tgz
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/RaFAH.pl
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/HP_Ranger_Model_3_Valid_Cols.txt
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/RaFAH_Predict_Host.R
    wget -P {params.rafah_dir} https://sourceforge.net/projects/rafah/files/RaFAH_v0.3_Files/RaFAH_Train_Model.R

    # decompress models
    cd {params.rafah_dir}
    tar -zxvf RaFAH_v0.3_hmm_models.tgz
    tar -zxvf RaFAH_v0.3_Ranger_Model.tgz
    """
Lines 239-252:
shell:
    """
    cd {params.out_dir}

    perl {input.rafah} --predict \
    --genomes_dir {params.virus_dir} \
    --extension .fna \
    --valid_ogs_file {input.valid_cols} \
    --genomexog_table_file_name /home/carsonjm/CarsonJM/phide_piper/results/07_VIRUS_HOST/03_rafah/RaFAH_Genome_to_OGs_Score_Min_Score_50-Max_evalue_1e-05_Prediction.tsv \
    --hmmer_db_file_name {params.hmm} \
    --r_script_predict_file_name {input.predict_script} \
    --r_model_file_name {input.filename} \
    --threads {threads}
    """
Lines 46-51:
shell:
    """
    # symlink input paths to renamed files
    ln -s {input.R1} {output.R1}
    ln -s {input.R2} {output.R2}
    """
Lines 70-75:
shell:
    """
    # make a bowtie2 db from virusdb
    bowtie2-build {input} {params.db} --threads {threads} \
    {params.extra_args}
    """
Lines 104-119:
shell:
    """
    # align reads to bowtie2 database
    bowtie2 \
    --threads {threads} \
    -x {params.db} \
    -1 {input.R1} \
    -2 {input.R2} \
    -S {params.sam} \
    {params.extra_args} \
    > {log} 2>&1

    # convert sam to bam
    samtools view -S -b {params.sam} > {output}
    rm {params.sam}
    """
Lines 152-158:
shell:
    """
    kraken2-build --download-taxonomy --db {params.db}
    kraken2-build --add-to-library {input} --db {params.db}
    # extra args are applied to the final build step
    kraken2-build --build --db {params.db} \
    {params.extra_args}
    """
Lines 180-186:
shell:
    """
    kraken2 --paired {input.R1} {input.R2} \
    --db {params.db} \
    --report {output.report} \
    {params.extra_args} > {output.classification}
    """
Lines 203-207:
shell:
    """
    bracken-build -d {params.db} -t {threads} -k 35 -l 150 \
    {params.extra_args}
    """
Lines 226-235:
shell:
    """
    # caution: bracken's -t sets the minimum read threshold for a taxon,
    # not a thread count, so passing {threads} here is likely unintended
    bracken -d {params.db} \
    -i {input.report} \
    -o {output.abundance} \
    -r 150 \
    -l {params.level} \
    -t {threads} \
    {params.extra_args}
    """
Lines 261-264:
shell:
    """
    # write "<sample>\t<mapped read count>" to the output file
    echo -e "{wildcards.assembly_sample}\t$(samtools view -c {input})" > {output}
    """
Lines 273-276:
shell:
    """
    cat {input} > {output}
    """
Lines 301-316:
shell:
    """
    pip install metapop

    # run metapop to identify viruses present in samples
    metapop --input_samples {params.bam_dir} \
    --norm {input.read_counts} \
    --reference {params.viruses_dir} \
    --output {params.out_dir} \
    --min_cov {params.min_breadth} \
    --minimum_bases_for_detection {params.min_length} \
    --min_dep {params.min_depth} \
    --no_micro \
    --threads {threads} \
    {params.extra_args}
    """
Lines 331-335:
shell:
    """
    prodigal -i {input} \
    -o {output}
    """
Lines 344-347:
shell:
    """
    samtools sort {input.bam} -o {output.sort}
    """
Lines 364-374:
shell:
    """
    inStrain profile \
    {input.bam} \
    {input.fasta} \
    -o {params.out_dir} \
    --skip_mm_profiling \
    --skip_genome_wide \
    --min_cov {params.min_breadth} \
    {params.extra_args}
    """
Lines 394-401:
shell:
    """
    inStrain compare \
    -i {params.profile} \
    -o {params.out_dir} \
    -bams {params.bam} \
    {params.extra_args}
    """

Maintainers: public
URL: https://github.com/margotl9/phide_piper_dev
Name: phide_piper_dev
Version: 1
Copyright: Public Domain
License: None