Snakemake pipelines for nanopore sequencing data archiving and processing


Documentation

Detailed documentation and tutorials are available at readthedocs.

Nanopype is a Snakemake-based pipeline providing convenient Oxford Nanopore Technologies (ONT) data processing and storage solutions.

The pipeline is split into a processing part, covering basecalling and alignment, and an analysis part covering further downstream applications. A summary of all included tools is given in the tools section.
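
As a sketch of how this split shows up in practice (illustrative only; the run name and reference tag below are placeholders, and the authoritative path patterns are described in the modules documentation), results are requested as output files whose paths encode the tools used, e.g. basecaller and aligner:

    # hypothetical targets following the path patterns used in the snippets below
    sequences/guppy/batches/default/<runname>.fastq.gz
    alignments/minimap2/guppy/batches/default/<runname>.<reference>.bam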

To get started, the installation chapter describes the available installation options depending on the operating system, available hardware, and already existing environments.

Recurring steps of nanopore data analysis are covered in the workflow chapter for both local and cluster usage.
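
A minimal invocation sketch (the Snakefile path, target file and cluster submission command are placeholders; the workflow chapter describes the exact setup): Snakemake is pointed at the Nanopype Snakefile and a requested output file, either locally or through a generic cluster submission:

    # local execution (sketch, assuming Nanopype was cloned to /path/to/nanopype)
    snakemake --snakefile /path/to/nanopype/Snakefile --jobs 16 \
        alignments/minimap2/guppy/batches/default/<runname>.<reference>.bam

    # cluster execution via Snakemake's generic --cluster option (submission command is site-specific)
    snakemake --snakefile /path/to/nanopype/Snakefile --jobs 100 \
        --cluster "qsub -pe smp {threads}" \
        alignments/minimap2/guppy/batches/default/<runname>.<reference>.bam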

The modules part gives an in-depth description of all available tools and workflows together with their respective configuration options; it is the main reference of the pipeline.

Finally, new users may find the tutorial helpful to learn the general concepts and usage of the pipeline. The test reads included in the package are sufficient to complete the tutorial; no separate wet-lab experiment is required.

Code Snippets

shell:
    """
    {config[bin_singularity][minimap2]} -t {threads} {config[alignment_minimap2_flags]} {input.reference} {input.sequence} 1>> {output} 2> >(tee {output}.log >&2)
    if [ $(grep 'ERROR' {output}.log | wc -l) -gt 0 ]; then exit 1; else rm {output}.log; fi
    """
shell:
    """
    {config[bin_singularity][graphmap2]} align -r {input.reference} -d {input.sequence} -t {threads} {config[alignment_graphmap2_flags]} >> {output}
    """
shell:
    """
    {config[bin_singularity][graphmap2]} align -r {input.fasta} --index-only
    """
shell:
    """
    cat {input.sequence} | {config[bin_singularity][ngmlr]} -r {input.reference} -t {threads} {config[alignment_ngmlr_flags]} >> {output}
    """
shell:
    """
    echo '' | {config[bin_singularity][ngmlr]} -r {input.fasta}
    touch {output.index}
    """
shell:
    """
    {config[bin_singularity][samtools]} faidx {input.fasta}
    """
shell:
    """
    # prepend a read group header (@RG ID:{wildcards.batch}), tag every alignment record with RG:Z:{wildcards.batch},
    # then convert to BAM, coordinate-sort and index; the first record is re-emitted after inserting the @RG line
    cat {input.sam} | perl -anle 'BEGIN{{$header=1}}; if($header == 1){{ if($_ =~ /^@/) {{print $_}} else {{$header=0; print "\@RG\tID:{wildcards.batch}"; print $_}}}} else {{print $_}}' | perl -anle 'if($_ =~ /^@/){{print $_}}else{{print join("\t", @F, "RG:Z:{wildcards.batch}")}}' | {config[bin_singularity][samtools]} view -b {config[alignment_samtools_flags]} - | {config[bin_singularity][samtools]} sort -m 4G > {output.bam}
    {config[bin_singularity][samtools]} index {output.bam}
    """
run:
    with open(output.txt, 'w') as fp:
        print('\n'.join(input.bam), file=fp)
shell:
    """
    # merge in chunks to stay below the open-file limit (ulimit -n), then merge and index the chunk BAMs
    split -l $((`ulimit -n` -10)) {input.bam} {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
run:
    with open(output.txt, 'w') as fp_out:
        for f in input.txt:
            with open(f, 'r') as fp_in:
                fp_out.write(fp_in.read())
shell:
    """
    split -l $((`ulimit -n` -10)) {input.bam} {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_1D2.py]} --buffer {params.buffer} --tolerance {params.tolerance} > {output}
    """
shell:
    """
    while IFS= read -r bam_file; do {config[bin_singularity][samtools]} view ${{bam_file}}; done < {input} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_stats.py]} {output}
    """
shell:
    """
    while IFS= read -r bam_file; do {config[bin_singularity][samtools]} view -bF 4 ${{bam_file}} -@ {threads} | {config[bin_singularity][bedtools]} bamtobed -i stdin; done < {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][alignment_cov.py]} {input.chr_sizes} > {output.bedGraph}
    {config[bin_singularity][bedGraphToBigWig]} {output.bedGraph} {input.chr_sizes} {output.bw}
    """
shell:
    """
    flye_dir=`dirname {config[bin_singularity][python]}`
    PATH=$flye_dir:$PATH
    {config[bin_singularity][python]} {config[bin_singularity][flye]} {params.flye_flags} -g {params.genome_size} -t {threads} {params.flye_preset} {input.seq} -o {params.out_prefix}
    mv {params.out_prefix}/assembly.fasta {output.fa}
    """
shell:
    """
    mkdir -p {params.out_prefix}
    {config[bin_singularity][wtdbg2]} {params.wtdbg2_flags} -x {params.wtdbg2_preset} -g {params.genome_size} -t {threads} -fo {params.out_prefix}/dbg -i {input.seq}
    {config[bin_singularity][wtpoa-cns]} -t {threads} -i {params.out_prefix}/dbg.ctg.lay.gz -fo {params.out_prefix}/dbg.raw.fa
    {config[bin_singularity][minimap2]} -t {threads} -ax map-ont -r2k {params.out_prefix}/dbg.raw.fa {input.seq} | {config[bin_singularity][samtools]} sort -@ 4 > {params.out_prefix}/dbg.bam
    {config[bin_singularity][samtools]} view -F0x900 {params.out_prefix}/dbg.bam | {config[bin_singularity][wtpoa-cns]} -t {threads} -d {params.out_prefix}/dbg.raw.fa -i - -fo {params.out_prefix}/dbg.cns.fa
    mv {params.out_prefix}/dbg.cns.fa {output.fa}
    """
shell:
    """
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format bulk
    {config[bin_singularity][guppy_basecaller]} -i raw/ --recursive --num_callers 1 --cpu_threads_per_caller {threads} -s workspace/ {params.guppy_config}  {params.filtering} {params.guppy_flags} {params.guppy_server}
    FASTQ_DIR='workspace/pass'
    if [ \'{params.filtering}\' = '' ]; then
        FASTQ_DIR='workspace'
    fi
    find ${{FASTQ_DIR}} -regextype posix-extended -regex '^.*f(ast)?q' -exec cat {{}} \; | gzip > {output[0]}
    find ${{FASTQ_DIR}} -name 'sequencing_summary.txt' -exec mv {{}} {output[1]} \;
    if [ \'{params.mod_table}\' != '' ]; then
        {config[bin_singularity][python]} {config[sbin_singularity][basecalling_guppy_mod.py]} extract `find workspace/ -name '*.fast5'` {params.mod_table}
    fi
    """
shell:
    """
    export OPENBLAS_NUM_THREADS=1
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format single
    find raw/ -regextype posix-extended -regex '^.*fast5' -type f -exec du -h {{}} + | sort -r -h | cut -f2 > raw.fofn
    split -e -n r/{threads} raw.fofn raw.fofn.part.
    ls raw.fofn.part.* | xargs -n 1 -P {threads} -I {{}} $SHELL -c 'cat {{}} | shuf | xargs -n 1 {config[bin_singularity][flappie]} --model {config[basecalling_flappie_model]} {config[basecalling_flappie_flags]} > raw/{{}}.fastq'
    find ./raw -regextype posix-extended -regex '^.*f(ast)?q' -exec cat {{}} \; > {wildcards.batch}.fq
    cat {wildcards.batch}.fq | {config[bin_singularity][python]} {config[sbin_singularity][methylation_flappie.py]} split methyl_marks.tsv | gzip > {output.sequence}
    cat methyl_marks.tsv | gzip > {output.methyl_marks}
    """
run:
    with open(output[0], 'wb') as fp_out:
        for f in input:
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
run:
    with open(output[0], 'wb') as fp_out:
        for f in input:
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
run:
    import gzip
    import pandas as pd
    def fastq_iter(iterable):
        while True:
            try:
                title = next(iterable)
                assert title[0] == '@'
                seq = next(iterable)
                _ = next(iterable)
                qual = next(iterable)
            except StopIteration:
                return
            mean_q = sum([ord(x) - 33 for x in qual]) / len(qual) if qual else 0.0
            yield len(seq), mean_q
    line_iter = (line for f in input for line in gzip.open(f, 'rb').read().decode('utf-8').split('\n') if line)
    df = pd.DataFrame(fastq_iter(line_iter), columns=['length', 'quality'])
    df.to_hdf(output[0], 'stats')
shell:
    "rm -r {input}"
shell:
    "rm -r {input}"
shell:
    "rm -r {input}"
shell:
    "rm -r {input}"
shell:
    "rm -r {input}"
shell:
    "rm -r {input}"
shell:
    "rm {input}"
From line 108 of rules/clean.smk
shell:
    """
    {config[bin_singularity][guppy_barcoder]} -i {params.seq_dir} -s {output.batches} -t {threads} -q {config[demux_batch_size]} --compress_fastq --{params.kits}
    """
run:
    import os, gzip, re
    import shutil
    import itertools
    barcodes = [f.name for f in os.scandir(input.batches) if f.is_dir()]
    def fastq_ID(iterable):
        fq_iter = itertools.islice(iterable, 0, None, 4)
        while True:
            try:
                header = next(fq_iter).decode('utf-8')
                yield header[1:].split()[0]
            except StopIteration:
                return
    def touch(fname, times=None):
        with open(fname, 'a'):
            os.utime(fname, times)
    for barcode in barcodes:
        os.makedirs(os.path.join(output.barcodes, barcode), exist_ok=True)
        batches = [f.name for f in os.scandir(os.path.join(input.batches, barcode)) if f.name.endswith('fastq.gz')]
        for batch in batches:
            # read IDs from fastq stream
            batch_file = os.path.join(input.batches, barcode, batch)
            IDs = []
            with gzip.open(batch_file, 'rb') as fp:
                IDs = [ID for ID in fastq_ID(fp)]
            # write IDs to batch file
            with open(os.path.join(output.barcodes, barcode, re.sub('fastq\.gz', 'txt', batch)), 'w') as fp:
                fp.write('\n'.join(IDs))
        # move fastq to sequences directory
        bc_map = (config['barcodes'][wildcards.runname] if wildcards.runname in config['barcodes'] else
                 config['barcodes']['__default__'] if '__default__' in config['barcodes'] else
                 {})
        bc_tags = [key for key, value in bc_map.items() if value == barcode]
        # move if one target tag
        if len(bc_tags) == 1:
            target_path = os.path.join('sequences', config['demux_seq_workflow'], 'batches',
                          (config['demux_seq_tag'] if 'demux_seq_tag' in config else 'default') + '.' + bc_tags[0],
                         wildcards.runname)
            shutil.move(os.path.join('demux', 'guppy', 'batches', wildcards.runname, barcode),
                        target_path)
            # touch sequences after creating batch ID files
            for f in os.scandir(target_path):
                if f.name.endswith('fastq.gz'):
                    touch(f.path)
        # copy if multiple tags per barcode
        elif len(bc_tags) > 1:
            for bc_tag in bc_tags:
                target_path = os.path.join('sequences', config['demux_seq_workflow'], 'batches',
                              (config['demux_seq_tag'] if 'demux_seq_tag' in config else 'default') + '.' + bc_tag,
                              wildcards.runname)
                if os.path.exists(target_path):
                    shutil.rmtree(target_path)
                shutil.copytree(os.path.join('demux', 'guppy', 'batches', wildcards.runname, barcode),
                                target_path)
From line 138 of rules/demux.smk
    shell : ""

#rule demux:
#    input:
#        "bin/deepbinner"
shell:
    """
    mkdir -p src && cd src
    wget -q -nc https://dl.google.com/go/go1.17.3.linux-amd64.tar.gz
    tar -xzf go1.17.3.linux-amd64.tar.gz
    rm go1.17.3.linux-amd64.tar.gz
    """
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d squashfs-tools ]; then
        git clone https://github.com/plougher/squashfs-tools --branch 4.4 --depth=1 && cd squashfs-tools
    else
        cd squashfs-tools && git fetch --all --tags --prune && git checkout tags/4.4
    fi
    cd squashfs-tools
    make
    cp *squashfs $install_prefix/bin/
    """
shell:
    """
    install_prefix=`pwd`
    mkdir -p src/gocode
    export GOPATH=$(pwd)/src/gocode
    export GOROOT=$(pwd)/src/go
    export PATH=$(pwd)/$(dirname {input.go}):$PATH
    cd src
    if [ ! -d singularity ]; then
        git clone https://github.com/sylabs/singularity.git --branch v3.3.0 --depth=1 && cd singularity
    else
        cd singularity && git fetch --all --tags --prune && git checkout tags/v3.3.0
    fi
    ./mconfig --without-suid --prefix=$install_prefix --localstatedir=$install_prefix/
    make -C ./builddir
    make -C ./builddir install
    """
shell:
    """
    install_prefix=`pwd`
    mkdir -p lib
    mkdir -p src && cd src
    if [ ! -d vbz_compression ]; then
        git clone https://github.com/nanoporetech/vbz_compression --recursive --branch v1.0.1 --depth=1 && cd vbz_compression
    else
        cd vbz_compression && git fetch --all --tags --prune && git checkout tags/v1.0.1
    fi
    rm -rf build
    mkdir build && cd build
    cmake -D CMAKE_BUILD_TYPE=Release -D ENABLE_CONAN=OFF -D ENABLE_PERF_TESTING=OFF -D ENABLE_PYTHON=OFF ..
    make
    cp bin/libvbz_hdf_plugin.so ${{install_prefix}}/{output.lib}
    """
shell:
    """
    wget -q -O {output} https://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig
    chmod 755 {output}
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d bedtools2 ]; then
        git clone https://github.com/arq5x/bedtools2 --branch v2.27.1 --depth=1 && cd bedtools2
    else
        cd bedtools2 && git fetch --all --tags --prune && git checkout tags/v2.27.1
    fi
    make clean && make
    cp bin/bedtools ../../{output.bin}
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d htslib ]; then
        git clone https://github.com/samtools/htslib --branch 1.9 --depth=1 && cd htslib
    else
        cd htslib && git fetch --all --tags --prune && git checkout tags/1.9
    fi
    autoheader && autoconf && ./configure && make -j{threads}
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d samtools ]; then
        git clone https://github.com/samtools/samtools --branch 1.9 --depth=1 && cd samtools
    else
        cd samtools && git fetch --all --tags --prune && git checkout tags/1.9
    fi
    autoheader --warning=none && autoconf -Wno-syntax && ./configure && make -j{threads}
    cp samtools ../../{output.bin}
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d minimap2 ]; then
        git clone https://github.com/lh3/minimap2 --branch v2.14 --depth=1 && cd minimap2
    else
        cd minimap2 && git fetch --all --tags --prune && git checkout tags/v2.14
    fi
    make clean && make -j{threads}
    cp minimap2 ../../{output.bin}
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d graphmap2 ]; then
        git clone https://github.com/lbcb-sci/graphmap2 --branch v0.6.4 --depth=1 && cd graphmap2
    else
        cd graphmap2 && git fetch --all --tags --prune && git checkout tags/v0.6.4
    fi
    make modules -j{threads} && make -j{threads}
    cp bin/*/graphmap2 ../../bin/
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d ngmlr ]; then
        git clone https://github.com/philres/ngmlr --branch v0.2.7 --depth=1 && cd ngmlr
    else
        cd ngmlr && git fetch --all --tags --prune && git checkout tags/v0.2.7
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -G {config[build_generator]} .. && cmake --build . --config Release -- -j {threads}
    cp ../bin/*/ngmlr ../../../bin
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d nanopolish ]; then
        #git clone --recursive https://github.com/jts/nanopolish --branch v0.11.0 --depth=1 && cd nanopolish
        git clone --recursive https://github.com/jts/nanopolish && cd nanopolish && git reset --hard 46a029c
    else
        #cd nanopolish && git fetch --all --tags --prune && git checkout tags/v0.11.0
        cd nanopolish && git fetch --all --tags --prune && git checkout 46a029c
    fi
    make clean
    set +e
    make -j{threads} 2>&1 | tee log.txt | grep 'g++\|gcc'
    exitcode=$?
    if [ $exitcode -ne 0 ]; then tail -n 100 log.txt; exit 1; fi
    cp nanopolish ../../bin/
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d Sniffles ]; then
        git clone https://github.com/fritzsedlazeck/Sniffles --branch v1.0.10 --depth=1 && cd Sniffles
    else
        cd Sniffles && git fetch --all --tags --prune && git checkout tags/v1.0.10
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -G{config[build_generator]} .. && cmake --build . --config Release -- -j {threads}
    cp ../bin/*/sniffles ../../../bin
    """
shell:
    """
    prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d svim ]; then
        git clone https://github.com/eldariont/svim.git --branch v1.4.0 --depth=1 && cd svim
    else
        cd svim && git fetch --all --tags --prune && git checkout tags/v1.4.0
    fi
    {config[python]} -m pip install .
    find {params.prefix}/bin {params.prefix}/local/bin -type f -name svim -exec cp {{}} ../../{output.bin} \; || true
    """
shell:
    """
    mkdir -p src/gocode
    export GOPATH=$(pwd)/src/gocode
    {input.go} env -w GO111MODULE=auto
    {input.go} get github.com/github/git-lfs
    cp src/gocode/bin/git-lfs bin/
    """
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d OpenBLAS ]; then
        git clone https://github.com/xianyi/OpenBLAS --branch v0.3.4 --depth=1 && cd OpenBLAS
    else
        cd OpenBLAS && git fetch --all --tags --prune && git checkout tags/v0.3.4
    fi
    make clean && make NO_LAPACK=1 NOFORTRAN=1 -j{threads}
    make install PREFIX=$install_prefix
    """
shell:
    """
    install_prefix=`pwd`
    mkdir -p src && cd src
    if [ ! -d hdf5 ]; then
        git clone https://bitbucket.hdfgroup.org/scm/hdffv/hdf5.git --branch hdf5-1_8_20 --depth=1 && cd hdf5
    else
        cd hdf5 && git fetch --all --tags --prune && git checkout tags/hdf5-1_8_20
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release -DHDF5_ENABLE_Z_LIB_SUPPORT=ON -DHDF5_BUILD_TOOLS=OFF -DBUILD_TESTING=OFF -DHDF5_BUILD_EXAMPLES=OFF -DCMAKE_INSTALL_PREFIX=$install_prefix -G{config[build_generator]} ../
    cmake --build . --config Release -- -j {threads}
    cmake --build . --config Release --target install
    """
shell:
    """
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.1_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.5_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_2.3.7_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.0.3_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.1.5_linux64.tar.gz &&
    # wget https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_3.4.4_linux64.tar.gz &&
    mkdir -p src/guppy && cd src/guppy && rm -rf *
    wget -q https://mirror.oxfordnanoportal.com/software/analysis/ont-guppy-cpu_4.0.11_linux64.tar.gz
    tar -xzkf ont-guppy-cpu_4.0.11_linux64.tar.gz -C ./ --strip 1 && \
    rm ont-guppy-cpu_4.0.11_linux64.tar.gz
    # copy everything except toplevel softlinks e.g.
    # skip libhdf5.so
    # copy libhdf5.so.1.8.11
    rsync --files-from=<(find . ! \( -type l -and -regex '^.*so$' \) -print) --links . ../../
    rm -r *
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d pychopper ]; then
        git clone https://github.com/nanoporetech/pychopper --branch v2.4.0 && cd pychopper
    else
        cd pychopper && git fetch --all --tags --prune && git checkout tags/v2.4.0
    fi
    {config[python]} -m pip install --upgrade incremental
    {config[python]} -m pip install --upgrade certifi
    {config[python]} -m pip install parasail --upgrade
    {config[python]} -m pip install "matplotlib<3.1" --upgrade
    {config[python]} setup.py install
    find {params.prefix}/bin {params.prefix}/local/bin -type f -name cdna_classifier.py -exec cp {{}} ../../{output.bin} \; || true
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d racon ]; then
        git clone https://github.com/isovic/racon --recursive --branch 1.3.2 && cd racon
    else
        cd racon && git fetch --all --tags --prune && git checkout tags/1.3.2
    fi
    mkdir -p build && cd build && rm -rf * && cmake -DCMAKE_BUILD_TYPE=Release ..
    make
    cp bin/racon ../../../{output.bin}
    """
shell:
    """
    mkdir -p src/gocode
    export PATH={params.go_dir}:$PATH
    export GOPATH=$(pwd)/src/gocode
    {input.go} env -w GO111MODULE=auto
    {input.go} get github.com/biogo/biogo/...
    {input.go} get github.com/biogo/hts/...
    {input.go} get -u gonum.org/v1/gonum/...
    {input.go} get github.com/google/uuid
    cd src
    if [ ! -d pinfish ]; then
        git clone https://github.com/nanoporetech/pinfish --branch master && cd pinfish
    else
        cd pinfish && git fetch --all --tags --prune && git checkout master
    fi
    cd cluster_gff && rm -f cluster_gff && make && cd ..
    cd collapse_partials && rm -f collapse_partials && make && cd ..
    cd polish_clusters && rm -f polish_clusters && make && cd ..
    cd spliced_bam2gff && rm -f spliced_bam2gff && make && cd ..
    cp cluster_gff/cluster_gff ../../bin/
    cp collapse_partials/collapse_partials ../../bin/
    cp polish_clusters/polish_clusters ../../bin/
    cp spliced_bam2gff/spliced_bam2gff ../../bin/
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d STRique ]; then
        git clone --recursive https://github.com/giesselmann/STRique --branch v0.3.0 && cd STRique
    else
        cd STRique && git fetch --all --tags --prune && git checkout tags/v0.3.0
    fi
    {config[python]} -m pip install -r requirements.txt --upgrade
    {config[python]} setup.py install
    cp scripts/STRique.py ../../bin/
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d Flye ]; then
        git clone https://github.com/fenderglass/Flye --branch 2.8.3 && cd Flye
    else
        cd Flye && git fetch --all --tags --prune && git checkout tags/2.8.3
    fi
    make
    {config[python]} setup.py install
    find {params.prefix}/bin {params.prefix}/local/bin -name flye -exec cp {{}} ../../{output.bin} \; || true
    """
shell:
    """
    mkdir -p src && cd src
    if [ ! -d wtdbg2 ]; then
        git clone https://github.com/ruanjue/wtdbg2 --branch v2.5 && cd wtdbg2
    else
        cd wtdbg2 && git fetch --all --tags --prune && git checkout tags/v2.5
    fi
    make
    cp wtdbg2 ../../{output.asm}
    cp wtpoa-cns ../../{output.cns}
    """
shell:
    """
    mkdir -p raw
    {config[bin_singularity][python]} {config[sbin_singularity][storage_fast5Index.py]} extract {input.batch} raw/ {params.index} --output_format lazy
    zcat {input.sequences} > sequences.fastx
    {config[bin_singularity][nanopolish]} index -d raw/ sequences.fastx
    {config[bin_singularity][nanopolish]} call-methylation -t {threads} -r sequences.fastx -g {input.reference} -b {input.bam} > tmp.tsv
    cat tmp.tsv | {config[bin_singularity][python]} {config[sbin_singularity][methylation_nanopolish.py]} | sort -k1,1 -k2,2n | gzip > {output[0]}
    if [ \'{params.raw_out}\' != '' ]; then
        cat tmp.tsv | gzip > {params.raw_out};
    fi
    """
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][methylation_flappie.py]} align {input.reference} {input.seq} {input.tsv} | sort -k1,1 -k2,2n | gzip > {output}
    """
shell:
    """
    {config[bin_singularity][samtools]} view -F 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][basecalling_guppy_mod.py]} align {input.hdf5} --reference {input.reference} | sort -k1,1 -k2,2n | gzip > {output}
    """
run:
    import gzip
    with gzip.open(output[0], 'wt') as fp_out:
        for i, f in enumerate(input):
            with gzip.open(f, 'rt') as fp_in:
                header, body = fp_in.read().split('\n', 1)
            # get header
            if i == 0:
                print(header, file=fp_out)
            print(body, file=fp_out, end='')
run:
    with open(output[0], 'wb') as fp_out:
        for f in sorted(input):     # sort by file name, you never know
            with open(f, 'rb') as fp_in:
                fp_out.write(fp_in.read())
run:
    import re
    with open(output[0], 'w') as fp_out:
        print('\n'.join([re.sub('.tsv.gz$','',f) for f in sorted(input)]), file=fp_out)
run:
    with open(output[0], 'w') as fp_out:
        print(''.join(open(f, 'r').read() for f in sorted(input)), file=fp_out, end='')
shell:
    """
    cat {input} | while read line; do zcat ${{line}}.tsv.gz; done | perl -anle 'print $_ if abs($F[6]) > {params.threshold}' | cut -f1-3,5 | sort -k1,1 -k2,2n | {config[bin_singularity][bedtools]} groupby -g 1,2,3 -c 4 -o mean,count | gzip > {output}
    """
shell:
    """
    zcat {input} | perl -anle 'print $_ if $F[4] >= {params.methylation_min_coverage}' | cut -f1-4 > {output}
    """
shell:
    """
    {config[bin_singularity][bedGraphToBigWig]} {input.bedGraph} {input.chr_sizes} {output}
    """
shell:
    """
    {config[bin_singularity][samtools]} view -hF 4 {input.bam} | {config[bin_singularity][python]} {config[sbin_singularity][methylation_sr.py]} {params.reference} {input.seq} {input.tsv} --threshold {params.threshold} --mode IGV --polish | {config[bin_singularity][samtools]} view -b > {output.bam}
    {config[bin_singularity][samtools]} index {output.bam}
    """
shell:
    """
    cat {input.fofn} | while read line; do echo ${{line}}.bam; done | split -l $((`ulimit -n` -10)) - {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
shell:
    """
    cat {input.fofn} | while read line; do echo ${{line}}.bam; done | split -l $((`ulimit -n` -10)) - {params.input_prefix}.part_
    for f in {params.input_prefix}.part_*; do {config[bin_singularity][samtools]} merge ${{f}}.bam -b ${{f}} -p -@ {threads}; done
    for f in {params.input_prefix}.part_*.bam; do {config[bin_singularity][samtools]} index ${{f}} -@ {threads}; done
    {config[bin_singularity][samtools]} merge {output.bam} {params.input_prefix}.part_*.bam -p -@ {threads}
    {config[bin_singularity][samtools]} index {output.bam} -@ {threads}
    rm {params.input_prefix}.part_*
    """
shell:
    """
    {config[bin_singularity][python]} {config[sbin_singularity][methylation_1D2.py]} {input.pairs} {input.values} | gzip > {output}
    """
run:
    import os
    import pandas as pd
    from pathlib import Path

    def du(root_dir='.'):
        root_dir = Path(root_dir)
        return sum(f.stat().st_size for f in root_dir.glob('**/*') if f.is_file())

    raw = sum(du(os.path.join(config['storage_data_raw'], runname)) for runname in config['runnames'])
    modules = [du(m) for m in input.modules]
    df = pd.DataFrame(data={'name' : ['raw'] + input.modules, 'bytes' : [raw] + modules})
    df.to_hdf(output[0], 'disk_usage')
run:
    import os, glob, itertools
    import snakemake
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns

    configure_seaborn(sns)

    # stats per workflow and tag
    workflows = snakemake.utils.listfiles('sequences/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}')
    workflows = sorted(list(workflows), key=lambda x: x[1].sequence_workflow)
    run_id = {runname:i for i, runname in enumerate(config['runnames'])}
    os.makedirs(output.stats, exist_ok=True)
    os.makedirs(output.plot, exist_ok=True)

    # plots
    summary = []
    df_list = []
    def n50(df):
        return df[df.throughput > (df.throughput.iloc[-1] / 2)].length.iloc[0]
    for sequence_workflow, values in itertools.groupby(workflows, lambda x: x[1].sequence_workflow):
        f, ax = plt.subplots(ncols=2, figsize=(10,5), constrained_layout=True)
        values = list(values)
        n_groups = len(values)
        for tag_folder, tag_wildcards in values:
            tag_inputs = list(snakemake.utils.listfiles('sequences/{sequence_workflow}/batches/{tag}/{runname}.hdf5'.format(sequence_workflow=sequence_workflow, tag=tag_wildcards.tag, runname='{runname, [^\/]*}')))
            if not tag_inputs:
                continue
            df = pd.concat([pd.read_hdf(f).assign(
                    Flowcell=lambda x: run_id[wc.runname],
                    Tag=lambda x: tag_wildcards.tag,
                    Basecalling=lambda x: sequence_workflow) for f, wc in tag_inputs if wc.runname in config['runnames']])
            df_list.append(df)
            df.sort_values(by='length', inplace=True)
            df['throughput'] = df.length.cumsum()
            xlim = df[df.throughput > (0.90 * df.throughput.iloc[-1])].length.iloc[0]
            xlim *= 1.1
            label = '{tag}'.format(tag=tag_wildcards.tag)
            # read length histogramm up to 90% of throughput
            sns.distplot(df.length, kde=False,
                hist_kws={'range': (0, xlim), 'alpha': 0.5 + (0.5 / n_groups)},
                label=label,
                ax=ax[0])
            # cumulative throughput over read length
            sns.lineplot(x='length', y='throughput',
                label=label,
                data=df[df.length < xlim].iloc[np.linspace(0, df[df.length < xlim].shape[0]-1, 1000).astype(int)],
                ax=ax[1])
            summary.append((tag_wildcards.tag, sequence_workflow,
                            df.length.sum(),
                            df.length.mean(), df.length.median(),
                            n50(df), df.length.max()
                            ))
        ax[0].set_xlabel('Read length')
        ax[0].set_ylabel('Reads')
        ax[0].legend()
        ax[1].set_xlabel('Read length')
        ax[1].set_ylabel('Throughput')
        ax[1].legend()
        f.suptitle("Read length distribution and cumulative throughput")
        f.savefig(os.path.join(output.plot, 'sequences1_{}.svg'.format(sequence_workflow)))

    if len(summary):
        df_total = pd.concat(df_list)
        df_total_agg = df_total.groupby(['Basecalling', 'Tag', 'Flowcell']).agg(
            Total=('length', 'sum')
        ).reset_index()
        for bc in df_total_agg.Basecalling.unique():
            f, ax = plt.subplots(nrows=2, figsize=(10,5), sharex=True, constrained_layout=True)
            # Barplot with throughput per flow cell
            sns.barplot(x='Flowcell', y='Total', hue='Tag', data=df_total_agg[df_total_agg.Basecalling == bc], errwidth=0, ax=ax[0])
            ax[0].set_ylabel('Throughput')
            ax[0].set_xlabel('')
            # Read length distribution per flow cell
            sns.boxplot(x="Flowcell", y="length", hue="Tag", showfliers=False, data=df_total[df_total.Basecalling == bc], ax=ax[1])
            ax[1].set_ylabel('Read length')
            ax[1].set_xlabel('Flow cell')
            f.suptitle("Total basepairs and read lengths per flow cell")
            f.savefig(os.path.join(output.plot, 'sequences2_{}.svg'.format(bc)))
        df_summary = pd.DataFrame(summary, columns=['Tag', 'Basecalling', 'Sum', 'Mean', 'Median', 'N50', 'Maximum'])
        df_summary.to_hdf(os.path.join(output.stats, 'sequences.hdf5'), 'sequences')
run:
    import os, itertools
    import snakemake
    import numpy as np
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns

    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)

    run_id = {runname:i for i, runname in enumerate(config['runnames'])}
    run_files = sorted(list(snakemake.utils.listfiles('alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/batches/{tag, [^\/]*}/{runname, [^.\/]*}.{reference, [^.]*}.hdf5')), key=lambda x: (x[1].sequence_workflow, x[1].tag))
    df_list = []
    for (sequence_workflow, tag), values in itertools.groupby(run_files, key=lambda x: (x[1].sequence_workflow, x[1].tag)):
        df = pd.concat([pd.read_hdf(f).assign(
                Basecaller=lambda x: sequence_workflow,
                Aligner=lambda x: w.aligner,
                Tag=lambda x: w.tag,
                Flowcell=lambda x: run_id[w.runname],
                Reference=lambda x: w.reference) for f, w in values])
        df['Mapping'] = df.flag.apply(lambda x : 'unmapped' if (x & 0x4) else 'secondary' if (x & 0x100) or (x & 0x800) else 'primary' if (x == 0) or (x == 0x10) else 'unknown')
        if 'identity' in df.columns:
            df_agg = df.groupby(['Basecaller', 'Tag', 'Aligner', 'Flowcell', 'Reference', 'Mapping', 'ID']).agg(
                length=('length', 'first'),
                mapped_length=('mapped_length', 'median'),
                Identity=('identity', 'median')
            ).reset_index().drop(columns=['ID'])
        else:
            df_agg = df.groupby(['Basecaller', 'Tag', 'Aligner', 'Flowcell', 'Reference', 'Mapping', 'ID']).agg(
                length=('length', 'first'),
                mapped_length=('mapped_length', 'median')
            ).reset_index().drop(columns=['ID'])
            df_agg['Identity'] = 0
        df_list.append(df_agg)
        # mapping rate by read numbers
        g = sns.catplot("Mapping", hue='Aligner', col="Reference",
            data=df_agg, kind="count", height=5, aspect=1.0)
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        (g.set_axis_labels("Mapping", "Reads"))
        g.savefig(os.path.join(output.plot, 'alignments_counts_{}_{}.svg'.format(sequence_workflow, tag)))
    # mapping rate by cumulative length
    df_agg = pd.concat(df_list)
    df_agg['Mapped %'] = df_agg['mapped_length'] / df_agg['length']
    df_agg_primary = df_agg[df_agg.Mapping == 'primary'].copy()
    df_agg_primary.sort_values(by=['Tag', 'Basecaller', 'Aligner', 'Reference', 'length'], inplace=True)
    df_agg_primary_grouper = df_agg_primary.groupby(['Tag', 'Basecaller', 'Aligner', 'Reference'], sort=False)
    df_agg_primary = df_agg_primary.assign(**{'Sequenced Bases' : df_agg_primary_grouper['length'].cumsum(), 'Mapped Bases': df_agg_primary_grouper['mapped_length'].cumsum()})
    xlim = df_agg_primary[df_agg_primary['Mapped Bases'] > (0.95 * df_agg_primary['Mapped Bases'].max())].length.iloc[0]
    xlim *= 1.1
    df_agg_primary.set_index('Basecaller', inplace=True)
    for bc in df_agg_primary.index.unique():
        df_subset = df_agg_primary.loc[bc]
        g = sns.relplot(x="length", y="Mapped Bases", hue='Aligner', style='Tag', col="Reference",
            data=df_subset[df_subset.length < xlim].iloc[np.linspace(0, df_subset[df_subset.length < xlim].shape[0]-1, 1000).astype(int)], kind='line', height=5, aspect=1.0)
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        (g.set_axis_labels("Read length", "Mapped Bases"))
        g.savefig(os.path.join(output.plot, 'alignments_bases_{}.svg'.format(bc)))
        # read identity if available
        # TODO this will fail if one aligner sets the NM-tag and the other not
        if df_subset.Identity.sum() > 0:
            g = sns.catplot(x="Flowcell", y="Identity", hue='Tag', row="Reference",
                data=df_subset, kind='violin', kwargs={'cut': 0}, height=2.5, aspect=4)
            g.fig.subplots_adjust(wspace=.05, hspace=0.5)
            (g.set_axis_labels("Flow cell", "BLAST identity")
            .set(ylim=(0,1)))
            g.savefig(os.path.join(output.plot, 'alignments_identity_{}.svg'.format(bc)))
run:
    import os, re
    import snakemake
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns

    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)

    for cov_file, cov_wc in snakemake.utils.listfiles('alignments/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.]*}.bedGraph'):
        df = pd.read_csv(cov_file, sep='\t', header=None, names=['chr', 'begin', 'end', 'coverage'])
        df['bin'] = df['begin'] // (df.begin.max() / 1000)
        def sort_key(k):
            return ''.join(re.findall(r'\D+', k) + ['{:05d}'.format(int(m)) for m in re.findall(r'\d+', k)])

        df_agg = df.groupby(['chr', 'bin']).agg(
            pos=('begin', 'first'),
            end=('end', 'last'),
            coverage=('coverage', 'median')
        ).reset_index()
        df_agg['key'] = df_agg.chr.map(sort_key)
        df_agg = df_agg.sort_values(by=['key', 'pos']).drop(columns=['key'])
        # coverage per chromosome
        g = sns.FacetGrid(df_agg, col="chr", hue='chr', col_wrap=4, sharex=True, sharey=True, height=1, aspect=2.5)
        g = (g.map(plt.plot, "pos", "coverage", marker=".", lw=0, markersize=0.2)
            .set(ylim=(0, df_agg.coverage.mean() * 2.0))
            .set_titles("{col_name}"))
        g.fig.subplots_adjust(wspace=.05, hspace=0.5)
        g.savefig(os.path.join(output.plot, 'coverage_{}_{}_{}_{}.svg'.format(
            cov_wc.aligner, cov_wc.sequence_workflow, cov_wc.tag, cov_wc.reference)))
run:
    import os, itertools
    import snakemake
    import pandas as pd
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    import seaborn as sns

    configure_seaborn(sns)
    os.makedirs(output.plot, exist_ok=True)

    workflows = snakemake.utils.listfiles('methylation/{methylation_caller, [^.\/]*}/{aligner, [^.\/]*}/{sequence_workflow, ((?!batches).)*}/{tag, [^\/]*}.{reference, [^.\/]*}.frequencies.tsv.gz')
    df_list = []
    df_agg_list = []
    for frequencies, wildcards in workflows:
        df = pd.read_csv(frequencies, sep='\t', header=None, compression='gzip', names=['chr', 'begin', 'end', 'level', 'coverage'])
        df.sort_values('coverage', inplace=True)
        df_agg = df.groupby('coverage', sort=False).agg(
            count=('coverage', 'count')
        ).reset_index().sort_values('coverage', ascending=False)
        df_agg['total'] = df_agg['count'].cumsum()
        df_agg['Methylation caller'] = wildcards.methylation_caller
        df_agg['Aligner'] = wildcards.aligner
        df_agg['Basecaller'] = wildcards.sequence_workflow
        df_agg['Tag'] = wildcards.tag
        df_agg['Reference'] = wildcards.reference
        df_agg_list.append(df_agg)
    # plots
    if df_agg_list:
        df_agg = pd.concat(df_agg_list)
        df_agg.set_index(['Basecaller', 'Aligner'], inplace=True)
        for basecaller, aligner in df_agg.index.unique():
            df_subset = df_agg.loc[basecaller, aligner]
            g = sns.relplot(x="coverage", y="total",
                 col="Reference", hue="Methylation caller", style="Tag",
                 kind="line", height=5, aspect=1.0,
                 data=df_subset[df_subset.total > (df_subset.total.max() * 0.05)])
            g.fig.subplots_adjust(wspace=.05, hspace=0.5)
            (g.set_axis_labels("Coverage threshold", "CpGs"))
            g.savefig(os.path.join(output.plot, 'coverage_{}_{}.svg'.format(aligner, basecaller)))
run:
    import os
    import snakemake
    import pandas as pd
    report = nanopype_report(os.getcwd(), output.pdf, version=config['version']['full-tag'])
    # summary
    summary_table = []
    summary_table.append(('{:d}'.format(len(config['runnames'])), 'Flow cells'))
    f_sequences_stats = 'report/stats/sequences/sequences.hdf5'
    df_sequences = pd.read_hdf(f_sequences_stats) if os.path.isfile(f_sequences_stats) else None
    if df_sequences is not None:
        for field, label, scale, unit in zip(['Sum', 'N50'], ['Total bases', 'N50'], [1e9, 1e3], ['GB', 'kB']):
            if df_sequences.shape[0] < 2:
                summary_table.append(('{:.1f} {}'.format(df_sequences[field][0] / scale, unit), label))
            else:
                summary_table.append(('{:.1f} to {:.1f} {}'.format(df_sequences[field].min() / scale, df_sequences[field].max() / scale, unit), label))
    df_disk_usage = pd.read_hdf(input.disk_usage[0])
    summary_table.append(('{:.2f} GB'.format(df_disk_usage.bytes.sum() / 1e9), 'Total disk usage'))
    report.add_summary(summary_table)
    # flow cells
    report.add_section_flowcells(config['runnames'])
    # sequences
    f_sequences_plots = snakemake.utils.listfiles('report/plots/sequences/sequences{i}_{basecalling}.svg')
    report.add_section_sequences(f_sequences_plots, df_sequences)
    # alignments
    f_alignments_counts = [(x[0], "Basecalling: {} ({})".format(x[1].sequence_workflow, x[1].tag)) for x in snakemake.utils.listfiles('report/plots/alignments/alignments_counts_{sequence_workflow, [^_]*}_{tag}.svg')]
    f_alignments_bases = [(x[0], "Basecalling: {}".format(x[1].sequence_workflow)) for x in snakemake.utils.listfiles('report/plots/alignments/alignments_bases_{sequence_workflow}.svg')]
    f_alignments_identity = [(x[0], "Basecalling: {}".format(x[1].sequence_workflow)) for x in snakemake.utils.listfiles('report/plots/alignments/alignments_identity_{sequence_workflow}.svg')]
    f_alignments_coverage = [x[0] for x in snakemake.utils.listfiles('report/plots/coverage/coverage_{tag}.svg')]
    report.add_section_alignments(counts=f_alignments_counts,
                    bases=f_alignments_bases,
                    identity=f_alignments_identity,
                    coverage=f_alignments_coverage)
    # methylation
    f_methylation_coverage = [(x[0], "Basecalling: {}, Alignment: {}".format(x[1].sequence_workflow, x[1].aligner)) for x in snakemake.utils.listfiles('report/plots/methylation/coverage_{aligner}_{sequence_workflow}.svg')]
    report.add_section_methylation(coverage=f_methylation_coverage)
    # build report pdf
    report.build()
shell:
    """
    {config[bin][python]} {config[sbin][storage_fast5Index.py]} index {input.batch} --out_prefix reads --tmp_prefix $(pwd) > {output}
    """
run:
    with open(output[0], 'w') as fp:
        for f in input.batches:
            print(open(f, 'r').read(), end='', file=fp)
shell:
    """
    mkdir -p {output}
    {config[bin][python]} {config[sbin][storage_fast5Index.py]} extract {input.names} {output} --index {input.index} --output_format bulk
    """
run:
    print(input.files)
shell:
    """
    {config[bin_singularity][sniffles]} -m {input} -v {output} -t {threads} {config[sv_sniffles_flags]}
    """
shell:
    """
    {config[bin_singularity][svim]} alignment sample {input.alignment} {input.reference} {config[sv_svim_flags]}
    mv sample/variants.vcf {output}
    """
shell:
    """
    cat {input} | gzip > {output}
    """
From line 106 of rules/sv.smk
shell:
    """
    export TMPDIR=$(pwd)
    {config[bin_singularity][samtools]} view -F 2308 {input.bam} | {config[bin_singularity][python]} {config[bin_singularity][strique]} count {input.index} {params.model} {input.config} {params.mod_model} --t {threads} > {output}
    """
run:
    with open(output[0], 'w') as fp_out:
        for i, f in enumerate(input):
            with open(f, 'r') as fp_in:
                if i == 0:
                    print(fp_in.read(), file=fp_out, end='')
                else:
                    next(fp_in)
                    print(fp_in.read(), file=fp_out, end='')
From line 142 of rules/sv.smk
run:
    with open(output[0], 'w') as fp_out:
        for i, f in enumerate(input):
            with open(f, 'r') as fp_in:
                if i == 0:
                    print(fp_in.read(), file=fp_out, end='')
                else:
                    next(fp_in)
                    print(fp_in.read(), file=fp_out, end='')
From line 157 of rules/sv.smk
shell:
    """
    mkfifo input_seq
    mkfifo output_seq
    zcat {input.seq} > input_seq &
    {config[bin_singularity][python]} {config[bin_singularity][cdna_classifier]} -b {input.prmr} input_seq output_seq &
    cat output_seq | gzip > {output.seq}
    """
shell:
    """
    racon_dir=`dirname {config[bin_singularity][racon]}`
    minimap_dir=`dirname {config[bin_singularity][minimap2]}`
    samtools_dir=`dirname {config[bin_singularity][samtools]}`
    PATH=$racon_dir:$minimap_dir:$samtools_dir:$PATH
    TMPDIR=`pwd`
    echo 'splicing initial alignment'
    {config[bin_singularity][spliced_bam2gff]} {config[transcript_spliced_bam2gff_flags]} -M -t {threads} {input.bam} > raw_transcript.gff
    echo 'cluster gff'
    {config[bin_singularity][cluster_gff]} {config[transcript_cluster_gff_flags]} -t {threads} -a cluster_transcript.tsv raw_transcript.gff > cluster_transcript.gff
    echo 'collapse cluster'
    {config[bin_singularity][collapse_partials]} {config[transcript_collapse_partials]} -t {threads} cluster_transcript.gff > cluster_collapsed.gff
    echo 'polish collapsed cluster'
    {config[bin_singularity][polish_clusters]} {config[transcript_polish_clusters]} -d $(pwd) -a cluster_transcript.tsv -t {threads} -o cluster_polish.fasta {input.bam}
    echo 'map collapsed polished cluster to reference'
    {config[bin_singularity][minimap2]} -t {threads} {config[alignment_minimap2_flags]} {input.reference} cluster_polish.fasta | {config[bin_singularity][samtools]} view -Sb -F 2304 | {config[bin_singularity][samtools]} sort -o cluster_polish.{wildcards.reference}.bam -
    {config[bin_singularity][samtools]} index cluster_polish.{wildcards.reference}.bam
    {config[bin_singularity][spliced_bam2gff]} {config[transcript_spliced_bam2gff_flags]} -M -t {threads} cluster_polish.{wildcards.reference}.bam > cluster_polish.gff
    echo 'collapse mapped transcripts'
    {config[bin_singularity][collapse_partials]} {config[transcript_collapse_partials]} -t {threads} cluster_polish.gff > {output.gff}
    """

Maintainers: public
URL: https://github.com/giesselmann/nanopype
Name: nanopype
Version: v1.2.0
Copyright: Public Domain
License: MIT License
