A Snakemake workflow for ecDNA detection in Nanopore or Illumina sequencing reads derived from DNA samples enriched for circular DNA.

public public 1yr ago Version: v2.0.0 0 bookmarks

A Snakemake workflow for ecDNA detection in Nanopore or Illumina sequencing reads derived from DNA samples enriched for circular DNA.

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog .

If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository.

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


shell("bcftools concat {bcftools_opts} {extra} {snakemake.input.calls} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(
    snakemake, parse_ref=False, parse_output_format=False, parse_memory=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


if "--tbi" in extra or "--csi" in extra:
    raise ValueError(
        "You have specified index format (`--tbi/--csi`) in `params.extra`; this is automatically infered from the first output file."
    )

if snakemake.output[0].endswith(".tbi"):
    extra += " --tbi"
elif snakemake.output[0].endswith(".csi"):
    extra += " --csi"
else:
    raise ValueError("invalid index file format ('.tbi', '.csi').")


shell("bcftools index {bcftools_opts} {extra} {snakemake.input[0]} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
__author__ = "Filipe G. Vieira"
__copyright__ = "Copyright 2020, Filipe G. Vieira"
__license__ = "MIT"


import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "bcftools sort"
        " {bcftools_opts}"
        " {extra}"
        " --temp-dir {tmpdir}"
        " {snakemake.input[0]}"
        " {log}"
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import infer_out_format
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake, parse_output=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_extra = snakemake.params.get("sort_extra", "")

out_ext = infer_out_format(snakemake.output[0])

pipe_cmd = ""
if out_ext != "PAF":
    # Add option for SAM output
    extra += " -a"

    # Determine which pipe command to use for converting to bam or sorting.
    if sort == "none":
        if out_ext != "SAM":
            # Simply convert to output format using samtools view.
            pipe_cmd = f"| samtools view -h {samtools_opts}"

    elif sort in ["coordinate", "queryname"]:
        # Add name flag if needed.
        if sort == "queryname":
            sort_extra += " -n"

        # Sort alignments.
        pipe_cmd = f"| samtools sort {sort_extra} {samtools_opts}"

    else:
        raise ValueError(f"Unexpected value for params.sort: {sort}")


shell(
    "(minimap2"
    " -t {snakemake.threads}"
    " {extra} "
    " {snakemake.input.target}"
    " {snakemake.input.query}"
    " {pipe_cmd}"
    " > {snakemake.output[0]}"
    ") {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "(minimap2 -t {snakemake.threads} {extra} "
    "-d {snakemake.output[0]} {snakemake.input.target}) {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
build = snakemake.params.build
release = int(snakemake.params.release)
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")


branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"


flavor = snakemake.params.get("flavor", "")
if flavor:
    flavor += "."


suffix = ""
if out_fmt == "gtf":
    suffix = "gtf.gz"
elif out_fmt == "gff3":
    suffix = "gff3.gz"
else:
    raise ValueError(
        "invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
    )


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    out_fmt=out_fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)


try:
    if out_gz:
        shell("curl -L {url} > {snakemake.output[0]} {log}")
    else:
        shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
19
20
shell:
    """cyrcular graph table {input.graph} {input.bcf} --reference {input.reference} --circle-table {output.overview} --segment-tables {output.details} 2> {log}"""
45
46
47
48
49
50
51
52
shell:
    "cyrcular graph annotate "
    "  --reference {input.reference} "
    "  --gene-annotation {input.gene_annotation} "
    "  --regulatory-annotation {input.regulatory_annotation} "
    "  --output {output.annotated} "
    "  {input.graph} "
    "2> {log} "
85
86
script:
    "../scripts/sort_bcf_header.py"
 98
 99
100
101
shell:
    """
    bcftools view -h {input.bcf} > {output.header} 2> {log}
    """
115
116
117
118
shell:
    """
    bcftools view -h {input.vcf} | rg {params.fields:q} > {output.header} 2> {log}
    """
12
13
wrapper:
    "v1.25.0/bio/bcftools/index"
SnakeMake From line 12 of rules/call.smk
34
35
wrapper:
    "v1.25.0/bio/bcftools/concat"
SnakeMake From line 34 of rules/call.smk
47
48
wrapper:
    "v1.25.0/bio/bcftools/sort"
SnakeMake From line 47 of rules/call.smk
66
67
68
69
shell:
    "varlociraptor "
    "call variants generic --obs {params.obs} "
    "--scenario {input.scenario} > {output} 2> {log}"
83
84
shell:
    "varlociraptor estimate alignment-properties {input.ref} --bam {input.bam} > {output} 2> {log}"
108
109
110
111
112
113
114
shell:
    "varlociraptor preprocess variants {input.ref} "
    "--candidates {input.candidates} "
    "--max-depth 200 "
    "--alignment-properties {input.alignment_props} "
    "--pairhmm-mode {params.mode} "
    "--bam {input.bam} --output {output} 2> {log}"
128
129
shell:
    "rbt vcf-split {input} {output}"
141
142
wrapper:
    "v1.25.0/bio/bcftools/sort"
SnakeMake From line 141 of rules/call.smk
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
shell:
    """cyrcular\
    graph\
    breakends\
    {input.bam}\
    --reference {input.ref}\
    --min-read-depth {params.min_read_depth}\
    --min-split-reads {params.min_split_reads}\
    --max-paths-per-component {params.max_paths_per_component}\
    --max-deletion-length {params.max_deletion_length}\
    -t {threads}\
    --output {output.bnds}\
    --graph {output.graph}\
    --dot {output.dot}\
    2> {log}"""
SnakeMake From line 167 of rules/call.smk
61
62
script:
    "../scripts/copy_qc_plots.py"
80
81
script:
    "../scripts/copy_graph_plots.py"
103
104
shell:
    "datavzrd {input.config} --output {output} &> {log}"
15
16
script:
    "../scripts/filter_overview_table.py"
34
35
36
37
shell:
    """
    varlociraptor filter-calls control-fdr --mode {params.mode} --events PRESENT --var BND --fdr {params.fdr} {input.calls} | varlociraptor decode-phred | bcftools sort -m 4G -Ob > {output.fdr_calls} 2> {log}
    """
16
17
wrapper:
    "v1.25.0/bio/minimap2/aligner"
SnakeMake From line 16 of rules/map.smk
35
36
shell:
    """{params.cmd} {input.fastqs} | pigz -c > {output} 2> {log}"""
SnakeMake From line 35 of rules/map.smk
52
53
wrapper:
    "v1.25.0/bio/samtools/index"
SnakeMake From line 52 of rules/map.smk
69
70
wrapper:
    "v1.25.0/bio/samtools/faidx"
SnakeMake From line 69 of rules/map.smk
18
19
20
21
22
23
24
25
shell:
    """cyrcular\
    graph\
    plot\
    {input.bam}\
    --graph {input.graph}\
    --output {output.plots}\
    2> {log}"""
SnakeMake From line 18 of rules/plot.smk
38
39
40
41
42
43
shell:
    """
    mkdir -p {output.pdf_dir}
    count=`ls -1 {input.graph}/ 2>{log} | wc -l`
    for f in {input.graph}/*.dot; do (dot $f -Tpdf > "{output.pdf_dir}/graph_$(basename ${{f}} .dot).pdf" 2>>{log}); done
    """
SnakeMake From line 38 of rules/plot.smk
17
18
wrapper:
    "v1.25.0/bio/reference/ensembl-sequence"
SnakeMake From line 17 of rules/ref.smk
32
33
wrapper:
    "v1.25.0/bio/samtools/faidx"
SnakeMake From line 32 of rules/ref.smk
56
57
wrapper:
    "v1.25.0/bio/minimap2/index"
SnakeMake From line 56 of rules/ref.smk
73
74
shell:
    """wget https://ftp.ensembl.org/pub/release-{params.release}/regulation/homo_sapiens/homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20220201.gff.gz --no-check-certificate -O {output} 2> {log}"""
SnakeMake From line 73 of rules/ref.smk
89
90
shell:
    """wget {params.download_link} --no-check-certificate -O {output} 2> {log}"""
SnakeMake From line 89 of rules/ref.smk
105
106
wrapper:
    "v1.25.0/bio/reference/ensembl-annotation"
SnakeMake From line 105 of rules/ref.smk
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import os
        import shutil
        import pandas as pd
        from pathlib import Path

        os.makedirs(snakemake.params.output_dir, exist_ok=True)
        overview = pd.read_csv(snakemake.input.overview, sep="\t")
        graph_ids = set(overview["graph_id"])
        for graph_id in graph_ids:
            shutil.copy(
                f"{snakemake.input.plots}/graph_{graph_id}.pdf", snakemake.params.output_dir
            )
        Path(snakemake.output.marker).touch()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import os
        import shutil
        import pandas as pd
        from pathlib import Path

        os.makedirs(snakemake.params.output_dir, exist_ok=True)
        overview = pd.read_csv(snakemake.input.overview, sep="\t")
        event_ids = {s.replace("-", "_") for s in overview["event_id"]}
        for event_id in event_ids:
            shutil.copy(
                f"{snakemake.input.plots}/graph_{event_id}.html",
                snakemake.params.output_dir,
            )
        Path(snakemake.output.marker).touch()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from contextlib import redirect_stderr

with open(snakemake.log[0], "w") as logfile:
    with redirect_stderr(logfile):
        import pandas as pd

        df = pd.read_csv(snakemake.input.table, sep="\t")
        df.loc[:, ["gene_names", "gene_ids", "regulatory_features"]] = df.loc[
            :, ["gene_names", "gene_ids", "regulatory_features"]
        ].fillna("")
        df["category"] = df[["num_exons", "regulatory_features", "gene_names"]].apply(
            lambda r: "coding"
            if r["num_exons"] > 0
            else (
                "regulatory"
                if r["regulatory_features"]
                else ("intronic" if r["gene_names"] else "other")
            ),
            axis=1,
        )
        for kind in ["coding", "regulatory", "intronic", "other"]:
            part = df.query(f"category == '{kind}'")
            part.to_csv(getattr(snakemake.output, kind), sep="\t", index=False)
        df.to_csv(snakemake.output.categorized, sep="\t", index=False)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
with open(snakemake.input.header, "rt") as header_file:
    header = [l.strip() for l in header_file.readlines()]
    file_format_line = header[0]
    chrom_line = header[-1]
    other_lines = header[1:-1]
    kinds = [
        "fileDate",
        "source",
        "reference",
        "contig",
        "phasing",
        "FILTER",
        "INFO",
        "FORMAT",
        "ALT",
        "assembly",
        "META",
        "SAMPLE",
        "PEDIGREE",
        "pedigreeDB",
    ]
    categories = {kind: [] for kind in kinds}
    others = []
    for line in other_lines:
        if "=" in line:
            kind = line.split("=")[0].lstrip("#")
            group = categories.get(kind, others)
        else:
            group = others
        group.append(line)

    with open(snakemake.output.sorted_header, "wt") as out:
        print(file_format_line, file=out)
        for kind in kinds:
            lines = categories[kind]
            for line in lines:
                print(line, file=out)
        for line in others:
            print(line, file=out)
        print(chrom_line, file=out)
ShowHide 39 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://snakemake.github.io/snakemake-workflow-catalog/?usage=snakemake-workflows/cyrcular-calling
Name: cyrcular-calling
Version: v2.0.0
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...