BioWorkflows: ATLAS - Three commands to start analyzing your metagenome data

import os, sys
import logging, traceback

# Write every record of level INFO or higher to the first log file that the
# Snakemake rule declares for this script, prefixed with a timestamp.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Route messages issued through the `warnings` module into the logging system
# so that they end up in the same log file.
logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its traceback.

    The signature matches ``sys.excepthook``. A ``KeyboardInterrupt`` is
    handed to the interpreter's original hook; every other exception is
    formatted and written at ERROR level through the ``logging`` module.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # Ctrl-C: keep the default interpreter behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts

from common_report import *

import os, sys
import pandas as pd
import plotly.express as px


# Display labels keyed by column name of the statistics table.
# NOTE(review): "N50"/"N90" are labelled as numbers and "L50"/"L90" as lengths;
# presumably this follows the convention of the tool that produced the table —
# confirm before changing.
labels = {
    "Percent_Assembled_Reads": "Percent of Assembled Reads",
    "contig_bp": "Total BP",
    "n_contigs": "Contigs (count)",
    "N_Predicted_Genes": "Predicted Genes (count)",
    "N50": "N50-number",
    "L50": "N50-length (bp)",
    "N90": "N90-number",
    "L90": "N90-length (bp)",
}


# Keyword arguments shared by every plotly express call in this script.
PLOT_PARAMS = dict(labels=labels)


def make_plots(combined_stats):
    """Create the plotly figures of the assembly report.

    Parameters
    ----------
    combined_stats
        Tab-separated table; its first column is used as index and treated
        as the sample name.

    Returns
    -------
    dict
        HTML snippets (``fig.to_html(**HTML_PARAMS)``) stored under the keys
        "Percent_Assembled_Reads", "N_Predicted_Genes", "N50", "N90" and
        "Total".
    """
    stats = pd.read_csv(combined_stats, sep="\t", index_col=0).sort_index(
        ascending=True
    )
    stats.index.name = "Sample"
    # Duplicate the index into a regular column; the figures refer to it
    # through hover_name="Sample".
    stats["Sample"] = stats.index

    def render(figure):
        return figure.to_html(**HTML_PARAMS)

    def strip_plot(column):
        return px.strip(stats, y=column, hover_name="Sample", **PLOT_PARAMS)

    def scatter_plot(x_column, y_column):
        return px.scatter(
            stats, y=y_column, x=x_column, hover_name="Sample", **PLOT_PARAMS
        )

    # Percentages are shown on a fixed 0-100 axis.
    assembled = strip_plot("Percent_Assembled_Reads")
    assembled.update_yaxes(range=[0, 100])

    return {
        "Percent_Assembled_Reads": render(assembled),
        "N_Predicted_Genes": render(strip_plot("N_Predicted_Genes")),
        "N50": render(scatter_plot("N50", "L50")),
        "N90": render(scatter_plot("N90", "L90")),
        "Total": render(scatter_plot("n_contigs", "contig_bp")),
    }


# main


# Build the figures from the combined contig statistics given as rule input.
div = make_plots(combined_stats=snakemake.input.combined_contig_stats)


# Write the report. `make_html` and `reports_dir` are not defined in this
# script; presumably they are provided by `from common_report import *`.
make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_assembly_report.html"),
)

Python Pandas plotly From line 1 of report/assembly_report.py

import os, sys
import logging, traceback

# Write every record of level INFO or higher to the first log file that the
# Snakemake rule declares for this script, prefixed with a timestamp.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Route messages issued through the `warnings` module into the logging system
# so that they end up in the same log file.
logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Write an uncaught exception and its traceback to the log.

    Takes the three arguments of ``sys.excepthook``. ``KeyboardInterrupt``
    is forwarded to the interpreter's original hook, everything else is
    logged at ERROR level.
    """
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
    else:
        report = ["Uncaught exception: "]
        report.extend(
            traceback.format_exception(exc_type, exc_value, exc_traceback)
        )
        logging.error("".join(report))


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts


from common_report import *

import pandas as pd
import plotly.express as px


from utils.taxonomy import tax2table


def make_plots(bin_info):
    """Create the HTML fragments of the bin report.

    Parameters
    ----------
    bin_info
        Tab-separated table with one row per bin; the first column is the
        bin id. The code uses the columns "Completeness", "Contamination",
        "Sample" and those listed in ``hover_data`` / ``size_name`` below.

    Besides ``bin_info`` the function reads ``snakemake.input.bins2species``
    (joined onto the bin table; it supplies at least "Representative").

    Returns
    -------
    dict
        Keys "input_file", "QualityScore", "table", "2D", "2Dsp" and
        "bySample", each holding a string of text or HTML.
    """
    species_table = snakemake.input.bins2species
    div = {"input_file": f"{bin_info} and {species_table}"}

    # Load the bin table and keep the bin id as index and as column.
    bins = pd.read_table(bin_info, index_col=0)
    bins["Bin Id"] = bins.index

    # Attach the species assignment of every bin.
    bin2species = pd.read_table(species_table, index_col=0)
    bins = bins.join(bin2species)

    logging.info(bins.head())

    logging.info(bin2species.head())

    # Summary table: number of bins and of distinct representatives per
    # quality category.
    summary = pd.DataFrame(columns=["Bins", "Species"])

    def add_stats(name, subset):
        summary.loc[name, "Bins"] = subset.shape[0]
        summary.loc[name, "Species"] = subset.Representative.unique().shape[0]

    add_stats("All", bins)

    bins.eval("Quality_score = Completeness - 5* Contamination", inplace=True)
    div[
        "QualityScore"
    ] = "<p>Quality score is calculated as: Completeness - 5 x Contamination.</p>"

    for row_name, condition in (
        ("Quality score >50 ", "Quality_score>50"),
        ("Good quality", "Completeness>90 & Contamination <5"),
        ("Quality score >90 ", "Quality_score>90"),
    ):
        add_stats(row_name, bins.query(condition))

    div["table"] = summary.to_html()

    logging.info(bins.describe())

    # Bin Id  Completeness    completeness_general    Contamination   completeness_specific   completeness_model_used translation_table_used  coding_density  contig_n50      average_gene_length      genome_size     gc_content      total_coding_sequences  additional_notes        quality_score   sample  Ambigious_bases Length_contigs  Length_scaffolds N50     N_contigs       N_scaffolds     logN50
    hover_data = [
        "Completeness_Model_Used",
        "Coding_Density",
        "N50",
        "GC_Content",
    ]
    size_name = "Genome_Size"
    lineage_name = "Species"

    def quality_scatter(data):
        # Contamination vs. completeness, shown on fixed axis ranges.
        figure = px.scatter(
            data_frame=data,
            y="Completeness",
            x="Contamination",
            color=lineage_name,
            size=size_name,
            hover_data=hover_data,
            hover_name="Bin Id",
        )
        figure.update_yaxes(range=(50, 102))
        figure.update_xaxes(range=(-0.2, 10.1))
        return figure.to_html(**HTML_PARAMS)

    # 2D plot of all bins
    logging.info("make 2d plot")
    div["2D"] = quality_scatter(bins)

    # 2D plot restricted to the rows whose id is a representative
    logging.info("make 2d plot species")
    div["2Dsp"] = quality_scatter(bins.loc[bins.Representative.unique()])

    ## By sample
    logging.info("plot  by sample")
    by_sample = px.strip(
        data_frame=bins,
        y="Quality_score",
        x="Sample",
        color=lineage_name,
        hover_data=hover_data,
        hover_name="Bin Id",
    )
    by_sample.update_yaxes(range=(50, 102))
    div["bySample"] = by_sample.to_html(**HTML_PARAMS)

    # A strip plot of Quality_score per lineage (div key "byPhylum") is
    # currently disabled.

    return div


# main


# Build the figures from the bin table given as rule input.
div = make_plots(bin_info=snakemake.input.bin_info)


# Write the report. `make_html` and `reports_dir` are not defined in this
# script; presumably they are provided by `from common_report import *`.
# The rule's wildcards are passed on to `make_html` as well.
make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_bin_report.html"),
    wildcards=snakemake.wildcards,
)

Python Pandas utils plotly From line 1 of report/bin_report.py

import os, sys
import logging, traceback

# Write every record of level INFO or higher to the first log file that the
# Snakemake rule declares for this script, prefixed with a timestamp.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Send an uncaught exception, including traceback, to the log.

    Accepts the arguments of ``sys.excepthook``. ``KeyboardInterrupt`` is
    delegated to the interpreter's original hook; any other exception is
    logged at ERROR level.
    """
    interrupted = issubclass(exc_type, KeyboardInterrupt)
    if interrupted:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    details = "".join(
        traceback.format_exception(exc_type, exc_value, exc_traceback)
    )
    logging.error(f"Uncaught exception: {details}")


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts

from common_report import *


import pandas as pd
import plotly.express as px
from plotly import subplots
import plotly.graph_objs as go
import numpy as np


# Display labels keyed by column name of the read count table.
labels = {"Total_Reads": "Total Reads", "Total_Bases": "Total Bases"}


# Keyword arguments shared by the plotly express strip plots in this script.
PLOT_PARAMS = dict(labels=labels)


import zipfile


def get_stats_from_zips(zips, samples):
    """Collect the mean quality per read position from zipped QC statistics.

    Parameters
    ----------
    zips
        Paths of zip archives, one per sample.
    samples
        Sample names, in the same order as ``zips``.

    Each archive is searched for tab-separated quality tables:

    * ``boxplot_quality.txt`` at the top level: its ``mean_1`` column is
      stored as single-end profile and the archive is not inspected further;
    * otherwise ``se/boxplot_quality.txt`` (``mean_1`` -> single-end) and
      ``pe/boxplot_quality.txt`` (``mean_1`` and ``mean_2`` -> paired-end).

    Archives containing none of these members contribute nothing.

    Returns
    -------
    (quality_pe, quality_se)
        ``quality_pe`` has two-level columns (read column, sample);
        ``quality_se`` has one column per sample. Either may be empty.
    """
    quality_pe = pd.DataFrame()
    quality_se = pd.DataFrame()

    def read_quality(archive, member):
        # Parse one quality table; the first column becomes the index.
        with archive.open(member) as handle:
            return pd.read_csv(handle, index_col=0, sep="\t")

    for zfile, sample in zip(zips, samples):
        # The context manager closes the archive after use; previously the
        # ZipFile was left open for every sample.
        with zipfile.ZipFile(zfile) as zf:
            members = set(zf.namelist())

            # single end only
            if "boxplot_quality.txt" in members:
                quality_se[sample] = read_quality(zf, "boxplot_quality.txt").mean_1
                continue

            if "se/boxplot_quality.txt" in members:
                quality_se[sample] = read_quality(
                    zf, "se/boxplot_quality.txt"
                ).mean_1

            if "pe/boxplot_quality.txt" in members:
                df = read_quality(zf, "pe/boxplot_quality.txt")
                # Add the sample name as second column level so that the
                # tables of all samples can sit side by side.
                df.columns = [df.columns, [sample] * df.shape[1]]

                quality_pe = pd.concat(
                    (quality_pe, df[["mean_1", "mean_2"]]), axis=1
                )

    return quality_pe, quality_se


def get_pe_read_quality_plot(df, quality_range, color_range):
    """Plot mean quality along forward and reverse reads, one line per sample.

    Parameters
    ----------
    df
        Table with two-level columns: ``df["mean_1"]`` and ``df["mean_2"]``
        each hold one column per sample; the index is used as x values.
    quality_range
        Value passed as ``range`` of the first y axis.
    color_range
        Colors, indexed by the position of the sample in ``df["mean_1"]``.

    Returns
    -------
    A figure with two subplot columns: ``mean_1`` on the left, ``mean_2`` on
    the right (x axis reversed).
    """
    fig = subplots.make_subplots(cols=2)

    def trace_spec(read_column, sample, color, **extra):
        # Properties shared by both traces of a sample.
        return dict(
            x=df.index,
            y=df[read_column][sample].values,
            type="scatter",
            name=sample,
            legendgroup=sample,
            **extra,
            marker=dict(color=color),
        )

    for idx, sample in enumerate(df["mean_1"].columns):
        color = color_range[idx]

        # left panel
        fig.append_trace(go.Scatter(**trace_spec("mean_1", sample, color)), 1, 1)

        # right panel; the legend entry is shown only once per sample
        fig.append_trace(
            trace_spec("mean_2", sample, color, showlegend=False), 1, 2
        )

    # NOTE(review): `range` and `autorange=True` are both passed for the
    # y axis — confirm which one is meant to win.
    fig.update_layout(
        yaxis=dict(range=quality_range, autorange=True, title="Average quality score"),
        xaxis1=dict(title="Position forward read"),
        xaxis2=dict(autorange="reversed", title="Position reverse read"),
    )

    return fig


def draw_se_read_quality(df, quality_range, color_range):
    """Plot mean quality along single-end reads, one line per sample.

    Parameters
    ----------
    df
        Table with one column per sample; the index is used as x values.
    quality_range
        Value passed as ``range`` of the y axis.
    color_range
        Colors, indexed by the position of the sample in ``df.columns``.

    Returns
    -------
    A figure with a single subplot.
    """
    fig = subplots.make_subplots(cols=1)
    positions = df.index

    for idx, sample in enumerate(df.columns):
        trace = go.Scatter(
            x=positions,
            y=df[sample].values,
            type="scatter",
            name=sample,
            legendgroup=sample,
            marker=dict(color=color_range[idx]),
        )
        fig.append_trace(trace, 1, 1)

    # NOTE(review): `range` and `autorange=True` are both passed for the
    # y axis — confirm which one is meant to win.
    y_axis = dict(range=quality_range, autorange=True, title="Average quality score")
    fig.update_layout(yaxis=y_axis, xaxis=dict(title="Position read"))
    return fig


def make_plots(
    samples, zipfiles_QC, read_counts, read_length, min_quality, insert_size_stats
):
    """Create the HTML fragments of the QC report.

    Parameters
    ----------
    samples
        Sample names, in the same order as ``zipfiles_QC``.
    zipfiles_QC
        Zip archives with the quality tables (see ``get_stats_from_zips``).
    read_counts
        Tab-separated table whose first two columns form the index; the code
        uses the index levels "Sample" and "Step" and the columns
        "Total_Reads" and "Total_Bases".
    read_length
        Tab-separated table that is transposed after reading; afterwards the
        columns "Median", "Max", "Min", "Avg", "Std_Dev" and "Mode" are used.
    min_quality
        Lower bound of the quality axis range.
    insert_size_stats
        Tab-separated insert size table, or ``None`` when no insert size
        information is available.

    Returns
    -------
    dict
        Keys "quality_QC", "Total_Reads", "Total_Bases", "Reads", "Length"
        and "Insert", each holding an HTML string.
    """
    div = {}

    ## Quality along read

    # One hue per sample, spread evenly over the color circle.
    n_samples = len(samples)
    color_range = [
        f"hsl({hue!s},50%,50%)" for hue in np.linspace(0, 360, n_samples + 1)
    ]

    # Load the quality profiles after QC.
    quality_pe, quality_se = get_stats_from_zips(zipfiles_QC, samples)

    # Determine the range of quality values; nanmax ignores the NaN coming
    # from whichever of the two tables is empty.
    highest = np.nanmax((quality_pe.max().max(), quality_se.max().max()))
    quality_range = [min_quality, 1 + highest]

    # Paired-end data is present when the paired table has rows.
    paired = quality_pe.shape[0] > 0
    if paired:
        quality_fig = get_pe_read_quality_plot(quality_pe, quality_range, color_range)
    else:
        quality_fig = draw_se_read_quality(quality_se, quality_range, color_range)
    div["quality_QC"] = quality_fig.to_html(**HTML_PARAMS)

    ## Total reads / bases after QC

    counts = pd.read_csv(read_counts, index_col=[0, 1], sep="\t")

    try:
        counts.drop("clean", axis=0, level=1, inplace=True)
    except KeyError:
        # The table has no "clean" entries; nothing to remove.
        pass

    data_qc = counts.query('Step=="QC"').reset_index()

    for var in ("Total_Reads", "Total_Bases"):
        fig = px.strip(data_qc, y=var, hover_data=["Sample", var], **PLOT_PARAMS)
        fig.update_yaxes(range=(0, data_qc[var].max() * 1.1))
        div[var] = fig.to_html(**HTML_PARAMS)

    ## Reads across the different steps

    total_reads = counts["Total_Reads"].unstack()

    steps_fig = px.bar(
        data_frame=total_reads,
        barmode="group",
        labels={"value": "Reads"},
        category_orders={"Step": ["raw", "deduplicated", "filtered", "QC"]},
    )
    steps_fig.update_yaxes(title="Number of reads")
    steps_fig.update_xaxes(tickangle=45)

    div["Reads"] = steps_fig.to_html(**HTML_PARAMS)

    ## Read length

    data_length = pd.read_table(read_length, index_col=0).T
    data_length.index.name = "Sample"

    length_fig = px.bar(
        data_frame=data_length,
        x="Median",
        error_x="Max",
        error_x_minus="Min",
        hover_data=["Median", "Max", "Min", "Avg", "Std_Dev", "Mode"],
    )
    length_fig.update_xaxes(title="Read length")

    div["Length"] = length_fig.to_html(**HTML_PARAMS)

    ## Insert size

    if insert_size_stats is not None:
        data_insert = pd.read_table(insert_size_stats, index_col=0)
        data_insert.index.name = "Sample"

        insert_fig = px.bar(
            data_frame=data_insert,
            x="Mean",
            error_x="STDev",
            hover_data=["Mean", "Median", "Mode", "PercentOfPairs"],
            labels={"PercentOfPairs": "Percent of pairs"},
        )
        insert_fig.update_xaxes(title="Insert size")

        div["Insert"] = insert_fig.to_html(**HTML_PARAMS)
    else:
        div[
            "Insert"
        ] = "<p>Insert size information is not available for single end reads.</p>"

    return div


# If paired we have information about insert size
# A single path arrives as a string; otherwise the input is unpacked into
# (read length table, insert size table). isinstance() is used instead of an
# exact type comparison so that str subclasses are recognised as well.
if isinstance(snakemake.input.read_length_stats, str):
    read_length_path = snakemake.input.read_length_stats
    insert_size_stats = None
else:
    read_length_path, insert_size_stats = snakemake.input.read_length_stats

# Build all figures from the rule's inputs and parameters.
div = make_plots(
    samples=snakemake.params.samples,
    zipfiles_QC=snakemake.input.zipfiles_QC,
    read_counts=snakemake.input.read_counts,
    read_length=read_length_path,
    min_quality=snakemake.params.min_quality,
    insert_size_stats=insert_size_stats,
)

# Write the report. `make_html` and `reports_dir` are not defined in this
# script; presumably they are provided by `from common_report import *`.
make_html(
    div=div,
    report_out=snakemake.output.report,
    html_template_file=os.path.join(reports_dir, "template_QC_report.html"),
)

Python Pandas numpy plotly From line 1 of report/qc_report.py

shell:
    """
    reformat.sh {params.inputs} \
        interleaved={params.interleaved} \
        {params.outputs} \
        iupacToN=t \
        touppercase=t \
        qout=33 \
        overwrite=true \
        verifypaired={params.verifypaired} \
        addslash=t \
        trimreaddescription=t \
        threads={threads} \
        pigz=t unpigz=t \
        -Xmx{resources.java_mem}G 2> {log}
    """

SnakeMake From line 56 of rules/assemble.smk

run:
    # Link every input file to the output at the same position.
    assert len(input) == len(
        output
    ), "Input and ouput files have not same number, can not create symlinks for all."
    # Pairwise iteration replaces the index loop; the lengths are equal here.
    for source, link_name in zip(input, output):
        # The link target is stored as absolute path.
        os.symlink(os.path.abspath(source), link_name)

SnakeMake From line 90 of rules/assemble.smk

shell:
    " bbnorm.sh {params.inputs} "
    " {params.outputs} "
    " tmpdir={resources.tmpdir} "
    " tossbadreads=t "
    " hist={output.histin} "
    " histout={output.histout} "
    " mindepth={params.mindepth} "
    " k={params.k} "
    " target={params.target} "
    " prefilter=t "
    " threads={threads} "
    " -Xmx{resources.java_mem}G &> {log} "

SnakeMake From line 135 of rules/assemble.smk

shell:
    "tadpole.sh -Xmx{resources.java_mem}G "
    " prefilter={params.prefilter} "
    " prealloc=1 "
    " {params.inputs} "
    " {params.outputs} "
    " mode=correct "
    " aggressive={params.aggressive} "
    " tossjunk={params.tossjunk} "
    " lowdepthfraction={params.lowdepthfraction}"
    " tossdepth={params.tossdepth} "
    " merge=t "
    " shave={params.shave} rinse={params.shave} "
    " threads={threads} "
    " pigz=t unpigz=t "
    " ecc=t ecco=t "
    "&> {log} "

SnakeMake TADpole From line 183 of rules/assemble.smk

shell:
    """
    bbmerge.sh -Xmx{resources.java_mem}G threads={threads} \
        in1={input[0]} in2={input[1]} \
        outmerged={output[2]} \
        outu={output[0]} outu2={output[1]} \
        {params.flags} k={params.kmer} \
        pigz=t unpigz=t \
        extend2={params.extend2} 2> {log}
    """

SnakeMake From line 231 of rules/assemble.smk

shell:
    "cat {input} > {output}"

SnakeMake From line 273 of rules/assemble.smk

shell:
    """
    rm -r {params.outdir} 2> {log}

    megahit \
    {params.inputs} \
    --tmp-dir {resources.tmpdir} \
    --num-cpu-threads {threads} \
    --k-min {params.k_min} \
    --k-max {params.k_max} \
    --k-step {params.k_step} \
    --out-dir {params.outdir} \
    --out-prefix {wildcards.sample}_prefilter \
    --min-contig-len {params.min_contig_len} \
    --min-count {params.min_count} \
    --merge-level {params.merge_level} \
    --prune-level {params.prune_level} \
    --low-local-ratio {params.low_local_ratio} \
    --memory {resources.mem}000000000  \
    {params.preset} >> {log} 2>&1
    """

SnakeMake MEGAHIT From line 320 of rules/assemble.smk

shell:
    "cp {input} {output}"

SnakeMake From line 350 of rules/assemble.smk

shell:
    "cp {input} {output}"

SnakeMake From line 466 of rules/assemble.smk

script:
    "../scripts/rename_assembly.py"

SnakeMake From line 488 of rules/assemble.smk

wrapper:
    "v1.19.0/bio/minimap2/aligner"

SnakeMake From line 511 of rules/assemble.smk

shell:
    "pileup.sh ref={input.fasta} in={input.bam} "
    " threads={threads} "
    " -Xmx{resources.java_mem}G "
    " covstats={output.covstats} "
    " concise=t "
    " minmapq={params.minmapq} "
    " secondary={params.pileup_secondary} "
    " 2> {log}"

SnakeMake From line 531 of rules/assemble.smk

shell:
    """filterbycoverage.sh in={input.fasta} \
    cov={input.covstats} \
    out={output.fasta} \
    outd={output.removed_names} \
    minc={params.minc} \
    minp={params.minp} \
    minr={params.minr} \
    minl={params.minl} \
    trim={params.trim} \
    -Xmx{resources.java_mem}G 2> {log}"""

SnakeMake From line 562 of rules/assemble.smk

shell:
    "cp {input} {output}"

SnakeMake From line 588 of rules/assemble.smk

run:
    os.symlink(os.path.relpath(input[0], os.path.dirname(output[0])), output[0])

SnakeMake From line 602 of rules/assemble.smk

shell:
    "stats.sh in={input} format=3 out={output} &> {log}"

SnakeMake From line 619 of rules/assemble.smk

wrapper:
    "v1.19.0/bio/minimap2/aligner"

SnakeMake From line 640 of rules/assemble.smk

shell:
    "pileup.sh "
    " ref={input.fasta} "
    " in={input.bam} "
    " threads={threads} "
    " -Xmx{resources.java_mem}G "
    " covstats={output.covstats} "
    " hist={output.covhist} "
    " concise=t "
    " minmapq={params.minmapq} "
    " secondary={params.pileup_secondary} "
    " bincov={output.bincov} "
    " 2> {log} "

SnakeMake From line 669 of rules/assemble.smk

shell:
    "samtools index {input}"

SnakeMake SAMtools From line 694 of rules/assemble.smk

shell:
    """
    prodigal -i {input} -o {output.gff} -d {output.fna} \
        -a {output.faa} -p meta -f gff 2> {log}
    """

SnakeMake prodigal From line 715 of rules/assemble.smk

run:
    # Convert the header lines of a protein fasta file into a tab-separated
    # gene table. Header fields are separated by " # "; the first field is
    # the gene name "<sample>_<contig nr>_<gene nr>" (the sample part may
    # itself contain underscores).
    header = [
        "gene_id",
        "Contig",
        "Gene_nr",
        "Start",
        "Stop",
        "Strand",
        "Annotation",
    ]
    with open(output.tsv, "w") as tsv:
        tsv.write("\t".join(header) + "\n")
        with open(input.faa) as fin:
            for line in fin:
                # Only fasta header lines carry gene information.
                if not line.startswith(">"):
                    continue

                gene_name, *annotation = line[1:].strip().split(" # ")

                name_parts = gene_name.split("_")
                gene_nr = name_parts[-1]
                contig_nr = name_parts[-2]
                sample = "_".join(name_parts[:-2])

                tsv.write(
                    "{gene_id}\t{sample}_{contig_nr}\t{gene_nr}\t{text}\n".format(
                        text="\t".join(annotation),
                        gene_id=gene_name,
                        sample=sample,
                        gene_nr=gene_nr,
                        contig_nr=contig_nr,
                    )
                )

SnakeMake From line 731 of rules/assemble.smk

script:
    "../scripts/combine_contig_stats.py"

SnakeMake From line 794 of rules/assemble.smk

script:
    "../report/assembly_report.py"

SnakeMake From line 807 of rules/assemble.smk

shell:
    "pileup.sh "
    " ref={input.fasta} "
    " in={input.bam} "
    " threads={threads} "
    " -Xmx{resources.java_mem}G "
    " covstats={output.covstats} "
    " secondary={params.pileup_secondary} "
    " 2> {log} "

SnakeMake From line 27 of rules/binning.smk

run:
    # Copy the first two tab-separated columns of the input table to the
    # output, dropping the header line.
    with open(input[0]) as source, open(output[0], "w") as target:
        next(source)  # skip header
        for record in source:
            fields = record.strip().split("\t")
            target.write(f"{fields[0]}\t{fields[1]}\n")

SnakeMake From line 48 of rules/binning.smk

run:
    from utils.parsers_bbmap import combine_coverages

    # combine_coverages returns two values; only the first one is used here.
    # "Avg_fold" is passed as third argument — presumably the name of the
    # coverage column to extract (confirm in utils.parsers_bbmap).
    combined_cov, _ = combine_coverages(
        input.covstats, get_alls_samples_of_group(wildcards), "Avg_fold"
    )

    # The table is written transposed, tab-separated.
    combined_cov.T.to_csv(output[0], sep="\t")

SnakeMake utils From line 66 of rules/binning.smk

shell:
    """
    concoct -c {params.Nexpected_clusters} \
        --coverage_file {input.coverage} \
        --composition_file {input.fasta} \
        --basename {params.basename} \
        --read_length {params.read_length} \
        --length_threshold {params.min_length} \
        --converge_out \
        --iterations {params.niterations}
    """

SnakeMake CONCOCT From line 98 of rules/binning.smk

run:
    # Rewrite the input file with every comma replaced by a tab.
    with open(input[0]) as source, open(output[0], "w") as target:
        target.writelines(row.replace(",", "\t") for row in source)

SnakeMake From line 120 of rules/binning.smk

shell:
    "jgi_summarize_bam_contig_depths "
    " --percentIdentity {params.minid} "
    " --outputDepth {output} "
    " {input.bams} &> {log} "

SnakeMake From line 145 of rules/binning.smk

shell:
    """
    metabat2 -i {input.contigs} \
        --abdFile {input.depth_file} \
        --minContig {params.min_contig_len} \
        --numThreads {threads} \
        --maxEdges {params.sensitivity} \
        --saveCls --noBinOut \
        -o {output} \
        &> {log}
    """

SnakeMake MetaBAT 2 From line 178 of rules/binning.smk

shell:
    """
    mkdir {output[0]} 2> {log}
    run_MaxBin.pl -contig {input.fasta} \
        -abund {input.abund} \
        -out {params.output_prefix} \
        -min_contig_length {params.mcl} \
        -thread {threads} \
        -prob_threshold {params.pt} \
        -max_iteration {params.mi} >> {log}

    mv {params.output_prefix}.summary {output[0]}/.. 2>> {log}
    mv {params.output_prefix}.marker {output[0]}/..  2>> {log}
    mv {params.output_prefix}.marker_of_each_bin.tar.gz {output[0]}/..  2>> {log}
    mv {params.output_prefix}.log {output[0]}/..  2>> {log}

    """

SnakeMake SqueezeMeta From line 211 of rules/binning.smk

run:
    import pandas as pd

    # Two-column table without header: the first column becomes the index,
    # the second one the cluster id. Squeezing only the column axis keeps a
    # Series even when the table has a single row (a plain squeeze() would
    # collapse it to a scalar and trip the assertion below).
    d = pd.read_csv(input[0], index_col=0, header=None, sep="\t").squeeze("columns")

    assert isinstance(
        d, pd.Series
    ), "expect the input to be a two column file: {}".format(input[0])

    # Cluster id 0 is excluded from renaming; entries with that id are
    # dropped below because they map to NaN.
    old_cluster_ids = list(d.unique())
    if 0 in old_cluster_ids:
        old_cluster_ids.remove(0)

    # Map every remaining cluster id to a generated name with the prefix
    # "<sample>_<binner>_".
    map_cluster_ids = dict(
        zip(
            old_cluster_ids,
            utils.gen_names_for_range(
                len(old_cluster_ids),
                prefix="{sample}_{binner}_".format(**wildcards),
            ),
        )
    )

    new_d = d.map(map_cluster_ids)
    new_d.dropna(inplace=True)
    if new_d.shape[0] == 0:
        logger.warning(
            f"No bins detected with binner {wildcards.binner} in sample {wildcards.sample}.\n"
            "I add longest contig to make the pipline continue"
        )

        # Placeholder entry so that the output is never empty.
        new_d[f"{wildcards.sample}_0"] = "{sample}_{binner}_1".format(**wildcards)

    new_d.to_csv(output[0], sep="\t", header=False)

SnakeMake Pandas numpy From line 244 of rules/binning.smk

run:
    # Write one "<fasta header>\t<bin id>" line per sequence of every bin
    # file matching params.file_name, and delete each bin file afterwards.
    (bin_ids,) = glob_wildcards(params.file_name)
    print(f"found {len(bin_ids)} bins")
    with open(output[0], "w") as out_file:
        for binid in bin_ids:
            bin_path = params.file_name.format(binid=binid)
            with open(bin_path) as bin_file:
                for line in bin_file:
                    if not line.startswith(">"):
                        continue
                    # Keep only the first word of the header.
                    fasta_header = line[1:].strip().split()[0]
                    out_file.write(f"{fasta_header}\t{binid}\n")
            os.remove(bin_path)

SnakeMake From line 294 of rules/binning.smk

script:
    "../scripts/get_fasta_of_bins.py"

SnakeMake From line 317 of rules/binning.smk

shell:
    "cp {input} {output}"

SnakeMake From line 330 of rules/binning.smk

shell:
    " DAS_Tool --outputbasename {params.output_prefix} "
    " --bins {params.scaffolds2bin} "
    " --labels {params.binner_names} "
    " --contigs {input.contigs} "
    " --search_engine diamond "
    " --proteins {input.proteins} "
    " --write_bin_evals "
    " --megabin_penalty {params.megabin_penalty}"
    " --duplicate_penalty {params.duplicate_penalty} "
    " --threads {threads} "
    " --debug "
    " --score_threshold {params.score_threshold} &> {log} "
    " ; mv {params.output_prefix}_DASTool_contig2bin.tsv {output.cluster_attribution} &>> {log}"

SnakeMake Diamond DAS From line 358 of rules/binning.smk

run:
    from utils.genome_stats import get_many_genome_stats

    # All files in the input directory whose name ends with params.extension.
    filenames = list(Path(input[0]).glob("*" + params.extension))

    # Arguments: the file list, the output path and the rule's thread count.
    get_many_genome_stats(filenames, output[0], threads)

SnakeMake utils From line 12 of rules/bin_quality.smk

run:
    # Concatenate the input tables into the output file. Any failure,
    # including a failing import, is written to the rule's log file before
    # the exception is propagated.
    try:
        from utils.io import pandas_concat

        pandas_concat(input, output[0])

    except Exception:
        import traceback

        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        # Bare raise re-raises the active exception with its original
        # traceback.
        raise

SnakeMake utils From line 32 of rules/bin_quality.smk

shell:
    " checkm2 predict "
    " --threads {threads} "
    " {params.lowmem} "
    " --force "
    " --allmodels "
    " -x .fasta "
    " --tmpdir {resources.tmpdir} "
    " --input {input.fasta_dir} "
    " --output-directory {params.dir} "
    " &> {log[0]} "
    ";\n"
    " cp {params.dir}/quality_report.tsv {output.table} 2>> {log[0]} ; "
    " mv {params.dir}/protein_files {output.faa} 2>> {log[0]} ; "

SnakeMake checkm2 From line 71 of rules/bin_quality.smk

shell:
    " mkdir {output.folder} 2> {log}"
    " ;\n"
    " gunc run "
    " --threads {threads} "
    " --gene_calls "
    " --db_file {input.db} "
    " --input_dir {input.fasta_dir} "
    " --temp_dir {resources.tmpdir} "
    " --file_suffix {params.extension} "
    " --out_dir {output.folder} &>> {log} "
    " ;\n "
    " cp {output.folder}/*.tsv {output.table} 2>> {log}"

SnakeMake gunc From line 107 of rules/bin_quality.smk

    shell:
        " busco -i {input.fasta_dir} "
        " --auto-lineage-prok "
        " -m genome "
        " --out_path {params.tmpdir} "
        " -o output "
        " --download_path {input.db} "
        " -c {threads} "
        " --offline &> {log} "
        " ; "
        " mv {params.tmpdir}/output/batch_summary.txt {output} 2>> {log}"

"""

SnakeMake From line 145 of rules/bin_quality.smk

run:
    # Concatenate the input tables into the output file. Any failure,
    # including a failing import, is written to the rule's log file before
    # the exception is propagated.
    try:
        from utils.io import pandas_concat

        pandas_concat(input, output[0])

    except Exception:
        import traceback

        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        # Bare raise re-raises the active exception with its original
        # traceback.
        raise

SnakeMake utils From line 180 of rules/bin_quality.smk

script:
    "../scripts/combine_checkm2.py"

SnakeMake From line 207 of rules/bin_quality.smk

run:
    import pandas as pd
    from pathlib import Path
    from utils import io


    def get_list_of_files(dirs, pattern):
        """Return a table of all files matching `pattern` in `dirs`.

        The table has a single column "Filename" and is indexed by
        io.simplify_path(filename) (index name "Bin"), sorted by index.
        """
        found = []

        # search every folder for files matching the pattern
        for directory in dirs:
            found.extend(Path(directory).glob(pattern))

        filenames = pd.DataFrame(found, columns=["Filename"])
        filenames.index = filenames.Filename.apply(io.simplify_path)
        filenames.index.name = "Bin"

        filenames.sort_index(inplace=True)

        return filenames


    fasta_filenames = get_list_of_files(input.dirs, "*.f*")
    faa_filenames = get_list_of_files(input.protein_dirs, "*.faa")

    # Index.equals also copes with tables of different length; an
    # element-wise comparison would raise a ValueError in that case instead
    # of reporting the mismatch.
    assert faa_filenames.index.equals(
        fasta_filenames.index
    ), "fasta index and faa index are not the same"

    faa_filenames.columns = ["Proteins"]

    # Columns "Filename" and "Proteins" side by side, one row per bin.
    filenames = pd.concat((fasta_filenames, faa_filenames), axis=1)

    filenames.to_csv(output.filenames, sep="\t")

SnakeMake Pandas utils From line 227 of rules/bin_quality.smk

run:
    from utils.io import cat_files

    # Concatenate all inputs into the first output; gzip=True is passed on to
    # the helper (presumably the output is gzip-compressed — confirm in
    # utils.io).
    cat_files(input, output[0], gzip=True)

SnakeMake utils From line 276 of rules/bin_quality.smk

script:
    "../scripts/filter_genomes.py"

SnakeMake From line 318 of rules/bin_quality.smk

shell:
    """
    cd-hit-est -i {input} -T {threads} \
    -M {resources.mem}000 -o {params.prefix} \
    -c {params.identity} -n 9  -d 0 {params.extra} \
    -aS {params.coverage} -aL {params.coverage} &> {log}

    mv {params.prefix} {output[0]} 2>> {log}
    """

SnakeMake cd-hit From line 74 of rules/cdhit.smk

run:
    # Write the cd-hit clusters as table; the header line comes first, the
    # rows are produced by write_cd_hit_clusters.
    with open(output[0], "w") as table:
        table.write("ORF\tLength\tIdentity\tRepresentative\n")
        clusters = parse_cd_hit_file(input[0])
        write_cd_hit_clusters(clusters, table)

SnakeMake From line 90 of rules/cdhit.smk

run:
    import pandas as pd
    import numpy as np

    from utils import gene_scripts

    # cd-hit table with the columns ORF, Length, Identity, Representative
    orf2gene = pd.read_csv(input.orf2gene, sep="\t")

    # Split the ORF names with the project helper; the result is the table
    # that receives the gene number below.
    orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF)

    # Number the representatives 1..n in order of first appearance.
    representatives = orf2gene.Representative.unique()
    gene_numbers = pd.Series(
        data=np.arange(1, len(representatives) + 1, dtype=np.uint),
        index=representatives,
    )

    # Every ORF gets the number of its representative.
    orf_info["GeneNr"] = orf2gene.Representative.map(gene_numbers)
    orf_info.to_parquet(output.cluster_attribution)

    # Save the mapping representative -> gene number.
    gene_numbers.index.name = "Representative"
    gene_numbers.name = "GeneNr"
    gene_numbers.to_csv(output.rep2genenr, sep="\t")

SnakeMake Pandas numpy utils From line 104 of rules/cdhit.smk

shell:
    " reformat.sh in={input} "
    " fastaminlen={params.min_length} "
    " out={output} "
    " overwrite=true "
    " -Xmx{resources.java_mem}G 2> {log} "

SnakeMake From line 24 of rules/cobinning.smk

run:
    import gzip as gz

    # Concatenate the gzipped fasta files of all samples; every header line
    # is prefixed with the sample name followed by params.seperator.
    with gz.open(output[0], "wb") as combined:
        for sample, input_fasta in zip(params.samples, input.fasta):
            header_prefix = f">{sample}{params.seperator}".encode()
            with gz.open(input_fasta, "rb") as contigs:
                for raw_line in contigs:
                    if raw_line.startswith(b">"):
                        # replace the leading ">" with the sample prefix
                        raw_line = header_prefix + raw_line[1:]
                    combined.write(raw_line)

SnakeMake From line 75 of rules/cobinning.smk

shell:
    "minimap2 -I {params.index_size} -t {threads} -d {output} {input} 2> {log}"

SnakeMake Minimap2 From line 105 of rules/cobinning.smk

shell:
    "samtools dict {input} | cut -f1-3 > {output} 2> {log}"

SnakeMake SAMtools From line 122 of rules/cobinning.smk

shell:
    """minimap2 -t {threads} -ax sr {input.mmi} {input.fq} | grep -v "^@" | cat {input.dict} - | samtools view -F 3584 -b - > {output.bam} 2>{log}"""

SnakeMake SAMtools Minimap2 From line 142 of rules/cobinning.smk

shell:
    "samtools sort {input} -T {params.prefix} --threads {threads} -m 3G -o {output} 2>{log}"

SnakeMake SAMtools From line 165 of rules/cobinning.smk

shell:
    "jgi_summarize_bam_contig_depths "
    " --percentIdentity {params.minid} "
    " --outputDepth {output} "
    " {input.bams} &> {log} "

SnakeMake From line 186 of rules/cobinning.smk

script:
    "../scripts/convert_jgi2vamb_coverage.py"

SnakeMake From line 207 of rules/cobinning.smk

shell:
    "vamb --outdir {output} "
    " -m {params.mincontig} "
    " --minfasta {params.minfasta} "
    " -o '{params.separator}' "
    " --jgi {input.coverage} "
    " --fasta {input.fasta} "
    "2> {log}"

SnakeMake VAMB From line 231 of rules/cobinning.smk

script:
    "../scripts/parse_vamb.py"

SnakeMake From line 264 of rules/cobinning.smk

shell:
    "skani triangle "
    " {params.extra} "
    " -l {input.paths} "
    " -o {output} "
    " -t {threads} "
    " --sparse --ci "
    " --min-af {params.min_af} "
    " &> {log} "

SnakeMake From line 21 of rules/derep.smk

run:
    # Convert the skani output table (input[0]) to parquet (output[0]),
    # keeping only the columns listed below and adding simplified
    # genome names. Any failure is written to the rule's log file.
    try:
        import pandas as pd

        from utils.io import simplify_path

        # Columns that are loaded and the dtype each one is parsed as.
        # The Ref_name and Query_name columns are deliberately not listed.
        skani_column_dtypes = {
            "Ref_file": "category",
            "Query_file": "category",
            "ANI": float,
            "Align_fraction_ref": float,
            "Align_fraction_query": float,
            "ANI_5_percentile": float,
            "ANI_95_percentile": float,
        }

        # Read the table once, restricted to the needed columns; the file
        # paths are categoricals so that renaming touches each path once.
        df = pd.read_table(
            input[0],
            usecols=list(skani_column_dtypes.keys()),
            dtype=skani_column_dtypes,
        )

        df["Ref"] = df.Ref_file.cat.rename_categories(simplify_path)
        df["Query"] = df.Query_file.cat.rename_categories(simplify_path)

        df.to_parquet(output[0])

    except Exception:
        import traceback

        # a `run:` block has no automatic log redirection, so the traceback
        # is written by hand before the error is propagated
        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        raise

SnakeMake Pandas utils From line 43 of rules/derep.smk

script:
    "../scripts/cluster_species.py"

SnakeMake From line 99 of rules/derep.smk

script:
    "../report/bin_report.py"

SnakeMake From line 113 of rules/derep.smk

run:
    shell(
        "wget -O {output} 'https://zenodo.org/record/{ZENODO_ARCHIVE}/files/{wildcards.filename}' "
    )
    if not FILES[wildcards.filename] == md5(output[0]):
        raise OSError(2, "Invalid checksum", output[0])

SnakeMake From line 134 of rules/download.smk

run:
    shell(
        "wget -O {output.tar} 'https://zenodo.org/record/{ZENODO_ARCHIVE}/files/{CHECKM_ARCHIVE}' "
    )
    if not FILES[CHECKM_ARCHIVE] == md5(output.tar):
        raise OSError(2, "Invalid checksum", CHECKM_ARCHIVE)

    shell("tar -zxf {output.tar} --directory {params.path}")

SnakeMake From line 148 of rules/download.smk

shell:
    "checkm data setRoot {params.database_dir} &> {log} "

SnakeMake CheckM From line 173 of rules/download.smk

shell:
    " wget --no-check-certificate {GTDB_DATA_URL} -O {output} &> {log} "

SnakeMake From line 191 of rules/download.smk

shell:
    'tar -xzvf {input} -C "{GTDBTK_DATA_PATH}" --strip 1 2> {log}; '

SnakeMake From line 207 of rules/download.smk

shell:
    " checkm2 database --download --path {output} "
    " &>> {log}"

SnakeMake checkm2 From line 221 of rules/download.smk

shell:
    "gunc download_db {resources.tmpdir} -db {wildcards.gunc_database} &> {log} ;"
    "mv {resources.tmpdir}/gunc_db_{wildcards.gunc_database}*.dmnd {output} 2>> {log}"

SnakeMake gunc From line 238 of rules/download.smk

shell:
    "busco -q --download_path {output} --download prokaryota &> {log}"

SnakeMake From line 254 of rules/download.smk

shell:
    " DRAM-setup.py prepare_databases "
    " --output_dir {output.dbdir} "
    " --threads {threads} "
    " --verbose "
    " --skip_uniref "
    " &> {log} "
    " ; "
    " DRAM-setup.py export_config --output_file {output.config}"

SnakeMake From line 33 of rules/dram.smk

shell:
    " DRAM.py annotate "
    " --config_loc {input.config} "
    " --input_fasta {input.fasta}"
    " --output_dir {output.outdir} "
    " --threads {threads} "
    " --min_contig_size {params.min_contig_size} "
    " {params.extra} "
    " --verbose &> {log}"

SnakeMake From line 65 of rules/dram.smk

run:
    from utils import io

    for i, annotation_file in enumerate(DRAM_ANNOTATON_FILES):
        input_files = [
            os.path.join(dram_folder, annotation_file) for dram_folder in input
        ]

        io.pandas_concat(
            input_files, output[i], sep="\t", index_col=0, axis=0, disk_based=True
        )

SnakeMake utils From line 94 of rules/dram.smk

shell:
    " DRAM.py distill "
    " --config_loc {input.config} "
    " --input_file {input[0]}"
    " --output_dir {output} "
    "  &> {log}"

SnakeMake From line 121 of rules/dram.smk

script:
    "../scripts/DRAM_get_all_modules.py"

SnakeMake From line 143 of rules/dram.smk

script:
    "../scripts/filter_genes.py"

SnakeMake From line 19 of rules/genecatalog.smk

run:
    from utils.io import cat_files

    cat_files(input.faa, output.faa)
    cat_files(input.fna, output.fna)
    cat_files(input.short, output.short)

SnakeMake utils From line 46 of rules/genecatalog.smk

run:
    from utils.io import cat_files

    cat_files(input.faa, output.faa)
    cat_files(input.fna, output.fna)
    cat_files(input.short, output.short)

SnakeMake utils From line 69 of rules/genecatalog.smk

shell:
    # Create the mmseqs database and cluster it. The first command starts a
    # fresh log (">") and every later command appends (">>"), so no output
    # of an earlier step is overwritten.
    """
    mkdir -p {params.tmpdir} {output} 2> {log}
    mmseqs createdb {input.faa} {params.db} &>> {log}

    mmseqs {params.clustermethod} -c {params.coverage} \
    --min-seq-id {params.minid} {params.extra} \
    --threads {threads} {params.db} {params.clusterdb} {params.tmpdir}  &>>  {log}

    rm -fr  {params.tmpdir} 2>> {log}
    """

SnakeMake MMseqs From line 104 of rules/genecatalog.smk

shell:
    """
    mmseqs createtsv {params.db} {params.db} {params.clusterdb} {output.cluster_attribution}  &> {log}

    mkdir {output.rep_seqs_db} 2>> {log}

    mmseqs result2repseq {params.db} {params.clusterdb} {output.rep_seqs_db}/db  &>> {log}

    mmseqs result2flat {params.db} {params.db} {output.rep_seqs_db}/db {output.rep_seqs}  &>> {log}

    """

SnakeMake MMseqs From line 132 of rules/genecatalog.smk

shell:
    " filterbyname.sh "
    " in={input.all}"
    " names={input.names}"
    " include=t"
    " out={output} "
    " -Xmx{resources.java_mem}G "
    " 2> {log}"

SnakeMake From line 158 of rules/genecatalog.smk

script:
    "../scripts/generate_orf_info.py"

SnakeMake From line 176 of rules/genecatalog.smk

script:
    "../scripts/rename_genecatalog.py"

SnakeMake From line 207 of rules/genecatalog.smk

shell:
    "stats.sh gcformat=4 gc={output} in={input} &> {log}"

SnakeMake From line 224 of rules/genecatalog.smk

wrapper:
    "v1.19.0/bio/minimap2/index"

SnakeMake From line 237 of rules/genecatalog.smk

shell:
    "cat {input} > {output} 2> {log}"

SnakeMake From line 251 of rules/genecatalog.smk

wrapper:
    "v1.19.0/bio/minimap2/aligner"

SnakeMake From line 269 of rules/genecatalog.smk

shell:
    " pileup.sh "
    " in={input.bam}"
    " covstats={output.covstats} "
    " rpkm={output.rpkm} "
    " secondary=t "
    " minmapq={params.minmapq} "
    " -Xmx{resources.java_mem}G "
    " 2> {log} "

SnakeMake From line 289 of rules/genecatalog.smk

run:
    try:
        import pandas as pd
        from utils.parsers_bbmap import read_pileup_coverage

        data = read_pileup_coverage(
            input[0],
            coverage_measure="Median_fold",
            other_columns=["Avg_fold", "Covered_percent", "Read_GC", "Std_Dev"],
        )
        data.index.name = "GeneName"
        data.sort_index(inplace=True)

        # rpkm = pd.read_csv(input[1],sep='\t',skiprows=4,usecols=["#Name","RPKM"],index_col=0).sort_index()

        data.reset_index().to_parquet(output[0])

    except Exception as e:
        import traceback

        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        raise e

SnakeMake Pandas utils From line 312 of rules/genecatalog.smk

script:
    "../scripts/combine_gene_coverages.py"

SnakeMake From line 374 of rules/genecatalog.smk

script:
    "../scripts/split_genecatalog.py"

SnakeMake From line 414 of rules/genecatalog.smk

shell:
    """
    emapper.py -m diamond --no_annot --no_file_comments \
        --data_dir {params.data_dir} --cpu {threads} -i {input.faa} \
        -o {params.prefix} --override 2> {log}
    """

SnakeMake Diamond eggNOG-mapper v2 From line 455 of rules/genecatalog.smk

shell:
    """

    if [ {params.copyto_shm} == "t" ] ;
    then
        cp {EGGNOG_DIR}/eggnog.db {params.data_dir}/eggnog.db 2> {log}
        cp {EGGNOG_DIR}/eggnog_proteins.dmnd {params.data_dir}/eggnog_proteins.dmnd 2>> {log}
    fi

    emapper.py --annotate_hits_table {input.seed} --no_file_comments \
      --override -o {params.prefix} --cpu {threads} --data_dir {params.data_dir} 2>> {log}
    """

SnakeMake eggNOG-mapper v2 From line 490 of rules/genecatalog.smk

run:
    try:

        import pandas as pd

        Tables = [
            pd.read_csv(file, index_col=None, header=None, sep="\t")
            for file in input
        ]

        combined = pd.concat(Tables, axis=0)

        del Tables

        combined.columns = EGGNOG_HEADER
        combined["Seed_evalue"] = combined["Seed_evalue"].astype("bytes")
        combined["Seed_Score"] = combined["Seed_Score"].astype("bytes")

        #           combined.sort_values("Gene",inplace=True)

        combined.to_parquet(output[0], index=False)
    except Exception as e:

        import traceback

        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        raise e

SnakeMake Pandas From line 519 of rules/genecatalog.smk

run:
    try:
        import pandas as pd

        df = pd.read_table(input[0], index_col=None)

        df.to_parquet(output[0], index=False)

    except Exception as e:

        import traceback

        with open(log[0], "w") as logfile:
            traceback.print_exc(file=logfile)

        raise e

SnakeMake Pandas From line 560 of rules/genecatalog.smk

shell:
    " rm -rf {params.outdir} &> {log[0]};"
    "\n"
    " DRAM.py annotate_genes "
    " --input_faa {input.faa}"
    " --config_loc {input.config} "
    " --output_dir {params.outdir} "
    " --threads {threads} "
    " {params.extra} "
    " --log_file_path {log[1]} "
    " --verbose &>> {log[0]}"

SnakeMake From line 607 of rules/genecatalog.smk

script:
    "../scripts/combine_dram_gene_annotations.py"

SnakeMake From line 638 of rules/genecatalog.smk

script:
    "../scripts/gene2genome.py"

SnakeMake From line 657 of rules/genecatalog.smk

shell:
    " DIR=$(dirname $(readlink -f $(which DAS_Tool))) "  # directory of the resolved DAS_Tool executable
    ";"
    " ruby {params.script_dir}/rules/scg_blank_diamond.rb diamond"
    " {input} "
    " $DIR/db/{params.key}.all.faa "  # plain "/" : "\/" is an invalid Python escape and the shell reads it as "/" anyway
    " $DIR/db/{params.key}.scg.faa "
    " $DIR/db/{params.key}.scg.lookup "
    " {threads} "
    " 2> {log} "
    " ; "
    " mv {input[0]}.scg {output}"  # move the "<input>.scg" file to the declared output

SnakeMake Diamond BioRuby From line 718 of rules/genecatalog.smk

script:
    "../scripts/rename_genomes.py"

SnakeMake From line 28 of rules/genomes.smk

run:
    # Write a two-column mapping "<contig>\t<bin>" for every FASTA file found
    # in the input directory. The bin name is the file name without its
    # extension(s); the contig name is the first word of each header.
    import gzip
    from glob import glob

    fasta_files = glob(input[0] + "/*.f*")

    with open(output[0], "w") as out_contigs:
        for fasta in fasta_files:
            bin_name, ext = os.path.splitext(os.path.split(fasta)[-1])
            # if gz remove also fasta extension
            is_gzipped = ext == ".gz"
            if is_gzipped:
                bin_name = os.path.splitext(bin_name)[0]

            # Compressed genomes match the glob as well, so they must be
            # decompressed while reading instead of being opened as text.
            opener = gzip.open if is_gzipped else open

            # write names of contigs in mapping file
            with opener(fasta, "rt") as f:
                for line in f:
                    if line.startswith(">"):
                        header = line[1:].strip().split()[0]
                        out_contigs.write(f"{header}\t{bin_name}\n")

SnakeMake From line 81 of rules/genomes.smk

shell:
    """
    prodigal -i {input} -o {output.gff} -d {output.fna} \
        -a {output.faa} -p meta -f gff 2> {log}
    """

SnakeMake prodigal From line 139 of rules/genomes.smk

shell:
    "cat {input} > {output}"

SnakeMake From line 178 of rules/genomes.smk

shell:
    "cat {input}/*{params.ext} > {output}"

SnakeMake From line 189 of rules/genomes.smk

wrapper:
    "v1.19.0/bio/minimap2/index"

SnakeMake From line 208 of rules/genomes.smk

wrapper:
    "v1.19.0/bio/minimap2/aligner"

SnakeMake From line 226 of rules/genomes.smk

wrapper:
    "v1.19.0/bio/bwa-mem2/index"

SnakeMake From line 243 of rules/genomes.smk

wrapper:
    "v1.19.0/bio/bwa-mem2/mem"

SnakeMake From line 262 of rules/genomes.smk

shell:
    # redirect stderr as well: mv reports its errors there, and a plain ">"
    # would leave the log empty on failure
    "mv {input} {output} &> {log}"

SnakeMake From line 289 of rules/genomes.smk

wrapper:
    "v1.19.0/bio/samtools/stats"

SnakeMake From line 303 of rules/genomes.smk

wrapper:
    "v1.19.1/bio/multiqc"

SnakeMake MultiQC From line 314 of rules/genomes.smk

shell:
    "pileup.sh in={input.bam} "
    " threads={threads} "
    " -Xmx{resources.java_mem}G "
    " covstats={output.covstats} "
    " fastaorf={input.orf} outorf={output.orf} "
    " concise=t "
    " physical=t "
    " minmapq={params.minmapq} "
    " bincov={output.bincov} "
    " 2> {log}"

SnakeMake From line 336 of rules/genomes.smk

script:
    "../scripts/combine_coverage_MAGs.py"

SnakeMake From line 371 of rules/genomes.smk

shell:
    'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
    "gtdbtk identify "
    "--genes --genome_dir {params.gene_dir} "
    " --out_dir {params.outdir} "
    "--extension {params.extension} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 20 of rules/gtdbtk.smk

shell:
    'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
    "gtdbtk align --identify_dir {params.outdir} --out_dir {params.outdir} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 42 of rules/gtdbtk.smk

shell:
    'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
    "gtdbtk classify --genome_dir {input.genome_dir} --align_dir {params.outdir} "
    " --mash_db {params.mashdir} "
    "--out_dir {params.outdir} "
    " --tmpdir {resources.tmpdir} "
    "--extension {params.extension} "
    "--cpus {threads} &> {log[0]}"

SnakeMake gtdbtk From line 67 of rules/gtdbtk.smk

script:
    "../scripts/combine_taxonomy.py"

SnakeMake From line 85 of rules/gtdbtk.smk

shell:
    'export GTDBTK_DATA_PATH="{GTDBTK_DATA_PATH}" ; '
    "gtdbtk infer --msa_file {input} "
    " --out_dir {params.outdir} "
    " --prefix {wildcards.msa} "
    " --cpus {threads} "
    "--tmpdir {resources.tmpdir} > {log[0]} 2> {log[1]}"

SnakeMake gtdbtk From line 102 of rules/gtdbtk.smk

script:
    "../scripts/root_tree.py"

SnakeMake From line 130 of rules/gtdbtk.smk

shell:
    "reformat.sh "
    " {params.inputs} "
    " interleaved={params.interleaved} "
    " {params.outputs} "
    " {params.extra} "
    " overwrite=true "
    " verifypaired={params.verifypaired} "
    " threads={threads} "
    " -Xmx{resources.java_mem}G "
    " 2> {log}"

SnakeMake From line 92 of rules/qc.smk

script:
    "../scripts/get_read_stats.py"

SnakeMake From line 130 of rules/qc.smk

shell:
    "clumpify.sh "
    " {params.inputs} "
    " {params.outputs} "
    " overwrite=true"
    " dedupe=t "
    " dupesubs={params.dupesubs} "
    " optical={params.only_optical}"
    " threads={threads} "
    " pigz=t unpigz=t "
    " -Xmx{resources.java_mem}G "
    " 2> {log}"

SnakeMake From line 174 of rules/qc.smk

shell:
    " bbduk.sh {params.inputs} "
    " {params.ref} "
    " interleaved={params.interleaved} "
    " {params.outputs} "
    " stats={output.stats} "
    " overwrite=true "
    " qout=33 "
    " trd=t "
    " {params.hdist} "
    " {params.k} "
    " {params.ktrim} "
    " {params.mink} "
    " trimq={params.trimq} "
    " qtrim={params.qtrim} "
    " threads={threads} "
    " minlength={params.minlength} "
    " maxns={params.maxns} "
    " minbasefrequency={params.minbasefrequency} "
    " ecco={params.error_correction_pe} "
    " prealloc={params.prealloc} "
    " pigz=t unpigz=t "
    " -Xmx{resources.java_mem}G "
    " 2> {log}"

SnakeMake BBMap From line 280 of rules/qc.smk

shell:
    "bbsplit.sh"
    " -Xmx{resources.java_mem}G "
    " {params.refs_in} "
    " threads={threads}"
    " k={params.k}"
    " local=t "
    " 2> {log}"

SnakeMake From line 331 of rules/qc.smk

shell:
    """
    if [ "{params.paired}" = true ] ; then
        bbsplit.sh in1={input[0]} in2={input[1]} \
            outu1={output[0]} outu2={output[1]} \
            basename="{params.contaminant_folder}/%_R#.fastq.gz" \
            maxindel={params.maxindel} minratio={params.minratio} \
            minhits={params.minhits} ambiguous={params.ambiguous} refstats={output.stats} \
            threads={threads} k={params.k} local=t \
            pigz=t unpigz=t ziplevel=9 \
            -Xmx{resources.java_mem}G 2> {log}
    fi

    bbsplit.sh in={params.input_single}  \
        outu={params.output_single} \
        basename="{params.contaminant_folder}/%_se.fastq.gz" \
        maxindel={params.maxindel} minratio={params.minratio} \
        minhits={params.minhits} ambiguous={params.ambiguous} refstats={output.stats} append=t \
        interleaved=f threads={threads} k={params.k} local=t \
        pigz=t unpigz=t ziplevel=9 \
        -Xmx{resources.java_mem}G 2>> {log}
    """

SnakeMake From line 386 of rules/qc.smk

run:
    import shutil
    import pandas as pd

    # For every read fraction, write the clean reads to the output file and,
    # when the rule has "rrna_reads" inputs, append those reads after them.
    with_rrna = hasattr(input, "rrna_reads")

    for idx, _fraction in enumerate(MULTIFILE_FRACTIONS):
        with open(output[idx], "wb") as merged:
            sources = [input.clean_reads[idx]]
            if with_rrna:
                sources.append(input.rrna_reads[idx])

            # byte-wise concatenation in the order listed above
            for source in sources:
                with open(source, "rb") as reads:
                    shutil.copyfileobj(reads, merged)

    # Record the paths of the QC-ed reads for this sample in the sample table
    # and write the table back to the file it was loaded from.
    qc_header = [f"Reads_QC_{fraction}" for fraction in MULTIFILE_FRACTIONS]
    sample_table = load_sample_table(params.sample_table)
    sample_table.loc[wildcards.sample, qc_header] = output
    sample_table.to_csv(params.sample_table, sep="\t")

SnakeMake Pandas From line 427 of rules/qc.smk

shell:
    " bbmerge.sh "
    " -Xmx{resources.java_mem}G "
    " threads={threads} "
    " {params.inputs} "
    " {params.flags} k={params.kmer} "
    " extend2={params.extend2} "
    " ihist={output.ihist} merge=f "
    " mininsert0=35 minoverlap0=8 "
    " prealloc=t prefilter=t "
    " minprob={params.minprob} 2> {log} \n  "
    """
    readlength.sh {params.inputs} out={output.read_length} 2>> {log}
    """

SnakeMake From line 477 of rules/qc.smk

shell:
    """
    readlength.sh in={input[0]} out={output.read_length} 2> {log}
    """

SnakeMake From line 511 of rules/qc.smk

run:
    import pandas as pd
    import os
    from utils.parsers_bbmap import parse_comments

    # Entries taken from each read-length file; they become the rows of the
    # combined table, with one column per sample.
    wanted_fields = ["Reads", "Bases", "Max", "Min", "Avg", "Median", "Mode", "Std_Dev"]

    stats = pd.DataFrame()
    for length_file in input:
        # the sample name is the first component of the file path
        sample, *_ = length_file.split(os.path.sep)
        parsed = pd.Series(parse_comments(length_file))
        stats[sample] = parsed[wanted_fields]

    stats.to_csv(output[0], sep="\t")

SnakeMake Pandas utils From line 525 of rules/qc.smk

run:
    import pandas as pd
    import os
    from utils.parsers_bbmap import parse_comments

    # Entries taken from each insert-size file.
    wanted_fields = ["Mean", "Median", "Mode", "STDev", "PercentOfPairs"]

    stats = pd.DataFrame()
    for insert_file in input:
        # the sample name is the first component of the file path
        sample, *_ = insert_file.split(os.path.sep)
        parsed = pd.Series(parse_comments(insert_file))
        stats[sample] = parsed[wanted_fields]

    # collected with one column per sample; transposed so that each sample
    # is a row in the written table
    stats.T.to_csv(output[0], sep="\t")

SnakeMake Pandas utils From line 579 of rules/qc.smk

run:
    from utils.io import pandas_concat

    pandas_concat(
        list(input.read_count_files),
        output.read_stats,
        sep="\t",
        index_col=[0, 1],
        axis=0,
    )

SnakeMake utils From line 612 of rules/qc.smk

run:
    from utils.io import pandas_concat

    pandas_concat(list(input), output[0], sep="\t", index_col=[0, 1], axis=0)

SnakeMake utils From line 632 of rules/qc.smk

script:
    "../report/qc_report.py"

SnakeMake From line 673 of rules/qc.smk

shell:
    "bbsketch.sh "
    "in={input[0]}" # take only one read
    " samplerate=0.5"
    " minkeycount=2 "
    " out={output} "
    " blacklist=nt ssu=f name0={wildcards.sample} depth=t overwrite=t "
    " -Xmx{resources.java_mem}g "
    " &> {log}"

SnakeMake From line 15 of rules/screen.smk

shell:
    "comparesketch.sh alltoall "
    " format=3 out={output} "
    " records=5000 "
    " {input} "
    " -Xmx{resources.java_mem}g "
    " &> {log}"

SnakeMake From line 40 of rules/screen.smk

shell:
    "SemiBin generate_sequence_features_multi"
    " --input-fasta {input.fasta} "
    " --input-bam {input.bams} "
    " --output {output} "
    " --threads {threads} "
    " --separator {params.separator} "
    " 2> {log}"

SnakeMake SemiBin From line 26 of rules/semibin.smk

shell:
    "SemiBin train_self "
    " --output {params.output_dir} "
    " --threads {threads} "
    " --data {params.data} "
    " --data-split {params.data_split} "
    " {params.extra} "
    " 2> {log}"

SnakeMake SemiBin From line 65 of rules/semibin.smk

shell:
    "SemiBin bin "
    " --input-fasta {input.fasta} "
    " --output {params.output_dir} "
    " --threads {threads} "
    " --data {params.data} "
    " --model {input.model} "
    " --minfasta-kbs {params.min_bin_kbs}"
    " {params.extra} "
    " 2> {log}"

SnakeMake SemiBin From line 128 of rules/semibin.smk

script:
    "../scripts/parse_semibin.py"

SnakeMake From line 158 of rules/semibin.smk

shell:
    " mkdir -p {params.outdir} 2> {log} "
    " ; "
    " prefetch "
    " --output-directory {params.outdir} "
    " -X 999999999 "
    " --progress "
    " --log-level info "
    " {wildcards.sra_run} &>> {log} "
    " ; "
    " vdb-validate {params.outdir}/{wildcards.sra_run}/{wildcards.sra_run}.sra &>> {log} "

SnakeMake From line 30 of rules/sra.smk

shell:
    " vdb-validate {params.sra_file} &>> {log} "
    " ; "
    " parallel-fastq-dump "
    " --threads {threads} "
    " --gzip --split-files "
    " --outdir {params.outdir} "
    " --tmpdir {resources.tmpdir} "
    " --skip-technical --split-3 "
    " -s {params.sra_file} &>> {log} "
    " ; "
    " rm -f {params.sra_file} 2>> {log} "

SnakeMake From line 66 of rules/sra.smk

run:
    from utils import io

    # Concatenate the input files of every read fraction into the output
    # file at the same position. An empty fraction name is looked up under
    # the "se" input key (presumably single-end reads).
    for out_idx, fraction in enumerate(SRA_read_fractions):
        input_key = "se" if fraction == "" else fraction
        io.cat_files(input[input_key], output[out_idx])

SnakeMake utils From line 123 of rules/sra.smk

shell:
    "inStrain compare "
    " --input {input.profiles} "
    " -o {output} "
    " -p {threads} "
    " -s {input.scaffold_to_genome} "
    " --database_mode "
    " {params.extra} &> {log}"

SnakeMake From line 56 of rules/strains.smk

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.DEBUG,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception with its full traceback.

    Intended to be installed as ``sys.excepthook``. A ``KeyboardInterrupt``
    is passed on to the interpreter's default hook instead of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Begining of scripts


import pandas as pd

import numpy as np
from utils import genome_dist as gd
import networkx as nx


def get_float(value):
    """Return `value` as a fraction in [0, 1].

    Values up to 1 are returned unchanged. Larger values are taken to be
    percentages (at most 100) and are divided by 100.
    """
    assert value >= 0

    if value <= 1:
        return value

    assert value <= 100, "it should be a percentage"
    logging.debug(f"Value {value} is > 1, I divede it with 100 to get a float")
    return value / 100


# Clustering parameters. Thresholds may be configured as a fraction or as a
# percentage; get_float() converts them to a fraction.
linkage_method = snakemake.params.linkage_method
pre_cluster_threshold = get_float(snakemake.params.pre_cluster_threshold)
threshold = get_float(snakemake.params.threshold)
min_aligned_fraction = get_float(snakemake.config["genome_dereplication"]["overlap"])

# verify ranges (lower bound, upper bound and a label are passed to the helper)
gd.verify_expected_range(pre_cluster_threshold, 0.8, 1, "pre_cluster_threshold")
gd.verify_expected_range(threshold, 0.8, 1, "ANI cluster threshold")
gd.verify_expected_range(min_aligned_fraction, 0.1, 0.95, "min_aligned_fraction")


# load the genome quality table, indexed by its first column
Q = pd.read_csv(snakemake.input.bin_info, sep="\t", index_col=0)
# notes are appended to with "+=" later on, so missing values become ""
Q.Additional_Notes = Q.Additional_Notes.fillna("").astype(str)


logging.info("Load distances")
M = gd.load_skani(snakemake.input.dist)

# genome distance to graph: only the pairs that satisfy this query are
# passed on to build the pre-cluster graph
pre_clustering_criteria = (
    f"ANI >= {pre_cluster_threshold} & Align_fraction > {min_aligned_fraction}"
)

logging.info(f"Pre-cluster genomes with the if '{pre_clustering_criteria}'")
G = gd.to_graph(M.query(pre_clustering_criteria))

# NOTE(review): self-loops are removed only when the graph object has a
# `selfloop_edges` method; newer networkx releases appear to provide this as
# `nx.selfloop_edges(G)` instead, in which case this is skipped -- confirm.
if hasattr(G, "selfloop_edges"):
    G.remove_edges_from(G.selfloop_edges())


# prepare table for species number: one row per genome of the quality table
mag2Species = pd.DataFrame(index=Q.index, columns=["SpeciesNr", "Species"])
mag2Species.index.name = "genome"
# genomes removed during clustering; dropped from both tables afterwards
genomes_to_drop = []

last_species_nr = 1  # start at 1


# Refine every pre-cluster (connected component of the graph G) on its own:
#   1. drop genomes that do not use the most frequent translation table,
#   2. if several completeness models were used, recalibrate the completeness
#      and drop genomes that no longer pass the configured filter,
#   3. cluster the remaining genomes into species by ANI.
n_pre_clusters = nx.connected.number_connected_components(G)
logging.info(f"Found {n_pre_clusters} pre-clusters, itterate over them.")
logging.debug(f"Cluster with threshold {threshold} and {linkage_method}-linkage method")
for i, cc in enumerate(nx.connected_components(G)):
    logging.info(f"Precluster {i+1}/{n_pre_clusters} with {len(cc)} genomes")

    # quality information of the genomes in this pre-cluster
    Qcc = Q.loc[list(cc)]

    # check translation table
    freq = Qcc["Translation_Table_Used"].value_counts()

    if freq.shape[0] > 1:
        logging.info(
            "Not all genomes use the same translation table, "
            "drop genomes that don't use main translation table."
        )
        logging.info(freq)

        # value_counts() sorts by descending frequency: first entry = most common
        main_tranlation_table = freq.index[0]

        drop_genomes = Qcc.query(
            "Translation_Table_Used != @main_tranlation_table"
        ).index

        cc = cc - set(drop_genomes)
        Qcc = Qcc.loc[list(cc)]
        genomes_to_drop += list(drop_genomes)
        logging.info(f"Drop {len(drop_genomes) } genomes, keep ({len(cc)})")

    ## Check that the same completeness model is used for all

    freq = Qcc["Completeness_Model_Used"].value_counts()
    if freq.shape[0] > 1:
        logging.info(
            "Not all genomes use the same completeness model. Recalibrate completeness."
        )

        logging.info(freq)

        # genomes that don't use specific model
        non_specific = Qcc.index[
            ~Qcc.Completeness_Model_Used.str.contains("Specific Model")
        ]

        logging.debug(
            f"{len(non_specific)} genomes use general completeness model. Recalibrate completeness and quality score to use lower value"
        )

        logging.debug(
            Qcc.loc[
                non_specific,
                ["Completeness_General", "Completeness_Specific", "Contamination"],
            ]
        )

        # use the lower of the two completeness estimates for these genomes
        Qcc.loc[non_specific, "Completeness"] = Qcc.loc[
            non_specific,
            [
                "Completeness_General",
                "Completeness_Specific",
            ],
        ].min(axis=1)

        # add note
        Q.loc[
            non_specific, "Additional_Notes"
        ] += "Completeness was re-calibrated based on Completeness model used in all genomes of the species."

        # transfer to main quality
        Q.loc[list(cc), "Completeness"] = Qcc.loc[list(cc), "Completeness"]

        # drop low quality genomes

        logging.info("Drop low quality genomes acording to filtercriteria")

        try:
            filter_criteria = snakemake.config["genome_filter_criteria"]
            # genomes of this pre-cluster that do NOT satisfy the filter query
            drop_genomes = Qcc.index.difference(Qcc.query(filter_criteria).index)

        except Exception as e:
            # best effort: a missing or invalid filter keeps all genomes
            logging.error("Cannot filter low quality genomes")
            logging.exception(e)

            drop_genomes = []

        if len(drop_genomes) > 0:
            cc = cc - set(drop_genomes)
            logging.info(
                f"Drop {len(drop_genomes) } with too low quality genomes, keep {len(cc)}"
            )

            Qcc = Qcc.loc[list(cc)]
            genomes_to_drop += list(drop_genomes)

    if len(cc) <= 1:
        # Genomes left alone here get no species number in this loop.
        logging.info(
            f"I am left with {len(cc)} genomes in this pre-cluster. No need to cluster."
        )
    else:
        # subset dist matrix to the pairs whose two genomes are both in this pre-cluster
        Mcc = M.loc[
            (M.index.levels[0].intersection(cc), M.index.levels[1].intersection(cc)),
        ]

        # Cluster species
        labels = gd.group_species_linkage(
            Mcc.ANI, threshold=threshold, linkage_method=linkage_method
        )

        logging.debug(f"Got {labels.max()} species cluster for this pre-cluster.")

        # enter values of labels to species table, offset by the highest
        # species number used so far so that numbers stay unique
        mag2Species.loc[labels.index, "SpeciesNr"] = labels + last_species_nr
        last_species_nr = mag2Species.SpeciesNr.max()


# Remove the genomes that were dropped during clustering from both tables.
mag2Species.drop(genomes_to_drop, inplace=True)
Q.drop(genomes_to_drop, inplace=True)


# Every remaining genome without a species number becomes its own species.
missing_species = mag2Species.index[mag2Species.SpeciesNr.isnull()]
N_missing_species = len(missing_species)

logging.info(
    f"{N_missing_species} genomes were not part of a pre-cluster and are singleton-species."
)

Q.loc[missing_species, "Additional_Notes"] += " Singleton species"

# consecutive numbers last_species_nr + 1 ... last_species_nr + N_missing_species
mag2Species.loc[missing_species, "SpeciesNr"] = (
    np.arange(last_species_nr, last_species_nr + N_missing_species) + 1
)


n_species = mag2Species.SpeciesNr.unique().shape[0]
logging.info(f"Identified {n_species } species in total")

# create proper species names: "sp" followed by the species number,
# zero-padded to the number of digits of the largest species number
n_leading_zeros = len(str(mag2Species.SpeciesNr.max()))
format_int = "sp{:0" + str(n_leading_zeros) + "d}"
mag2Species["Species"] = mag2Species.SpeciesNr.apply(format_int.format)


# calculate quality score


logging.info("Define Quality score defined as Completeness - 5x Contamination")
# recalculate the quality score as some completeness values might have been recalibrated.
Q.eval("Quality_score = Completeness - 5* Contamination", inplace=True)
quality_score = Q.Quality_score

# a missing score means Completeness or Contamination is missing for a genome
assert (
    not quality_score.isnull().any()
), "I have NA quality values for thq quality score, it seems not all of the values defined in the quality_score_formula are presentfor all entries in tables/Genome_quality.tsv "


# select representative: the helper receives the species assignment and the
# quality score of every genome
logging.info("Select representative")
mag2Species["Representative"] = gd.best_genome_from_table(
    mag2Species.Species, quality_score
)

mag2Species.to_csv(snakemake.output.bins2species, sep="\t")

# The (possibly recalibrated) quality table is written separately rather
# than joined onto the species table.
# mag2Species = mag2Species.join(Q)
Q.to_csv(snakemake.output.bin_info, sep="\t")

Python Pandas numpy utils networkx From line 1 of scripts/cluster_species.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

logging.captureWarnings(True)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts

import pandas as pd
from utils.parsers import read_checkm2_output


def main(samples, completeness_files, bin_table):
    """Combine the per-sample CheckM2 tables into one tab-separated table.

    Args:
        samples: sample names, in the same order as ``completeness_files``.
        completeness_files: one CheckM2 output path per sample.
        bin_table: path the combined table is written to.

    Raises:
        ValueError: if the number of samples and files differ.
    """
    # a length mismatch would otherwise label tables with the wrong sample
    if len(samples) != len(completeness_files):
        raise ValueError(
            f"Got {len(samples)} samples but {len(completeness_files)} "
            "completeness files"
        )

    per_sample_tables = []
    for sample, completeness_file in zip(samples, completeness_files):
        sample_table = read_checkm2_output(completness_table=completeness_file)
        # tag each row with the sample it originates from
        sample_table["Sample"] = sample
        per_sample_tables.append(sample_table)

    combined = pd.concat(per_sample_tables, axis=0)
    combined.to_csv(bin_table, sep="\t")


if __name__ == "__main__":
    main(
        samples=snakemake.params.samples,
        completeness_files=snakemake.input.completeness_files,
        bin_table=snakemake.output.bin_table,
    )

Python Pandas utils From line 1 of scripts/combine_checkm2.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd
from utils.parsers_bbmap import parse_pileup_log_file


def parse_map_stats(sample_data, out_tsv):
    """Build one table of assembly statistics with a row per sample.

    Args:
        sample_data: mapping ``sample -> {"contig_stats", "gene_table",
            "mapping_log"}`` of file paths.
        out_tsv: path the combined table is written to (tab-separated).

    Returns:
        The combined DataFrame, indexed by sample.

    Raises:
        AssertionError: if a contig-stats file does not hold exactly one row.
    """
    sample_stats = {}
    for sample, files in sample_data.items():
        df = pd.read_csv(files["contig_stats"], sep="\t")

        # report the row count; indexing the first row here would itself
        # fail on an empty table and hide the real problem
        assert df.shape[0] == 1, "Assumed only one row in file {}; found {}".format(
            files["contig_stats"], df.shape[0]
        )

        # number of predicted genes = number of rows in the gene table
        genes_df = pd.read_csv(files["gene_table"], index_col=0, sep="\t")
        df["N_Predicted_Genes"] = genes_df.shape[0]

        # mapping stats
        mapping_stats = parse_pileup_log_file(files["mapping_log"])
        df["Assembled_Reads"] = mapping_stats["Mapped reads"]
        df["Percent_Assembled_Reads"] = mapping_stats["Percent mapped"]

        logging.info(f"Stats for sample {sample}\n{df}")

        sample_stats[sample] = df

    stats_df = pd.concat(sample_stats, axis=0)
    # concat on a dict yields a (sample, row) MultiIndex; keep only the sample
    stats_df.index = stats_df.index.get_level_values(0)
    # drop the scaffold-level columns (scaf_*) and keep the contig-level ones,
    # stripping their "ctg_" prefix
    stats_df = stats_df.loc[:, ~stats_df.columns.str.startswith("scaf_")]
    stats_df.columns = stats_df.columns.str.replace("ctg_", "")
    # save
    stats_df.to_csv(out_tsv, sep="\t")
    return stats_df


def main(samples, contig_stats, gene_tables, mapping_logs, combined_stats):
    """Assign the input files to their samples and write the combined stats.

    A path is attributed to a sample when it contains ``"<sample>/"``; if
    several paths match, the last one wins.
    """
    # (key in the per-sample record, candidate paths), in insertion order
    sources = (
        ("contig_stats", contig_stats),
        ("gene_table", gene_tables),
        ("mapping_log", mapping_logs),
    )

    sample_data = {}
    for sample in samples:
        marker = "%s/" % sample
        record = {}
        for key, paths in sources:
            for path in paths:
                if marker in path:
                    record[key] = path
        sample_data[sample] = record

    parse_map_stats(sample_data, combined_stats)


if __name__ == "__main__":
    main(
        samples=snakemake.params.samples,
        contig_stats=snakemake.input.contig_stats,
        gene_tables=snakemake.input.gene_tables,
        mapping_logs=snakemake.input.mapping_logs,
        combined_stats=snakemake.output.combined_contig_stats,
    )

Python Pandas utils From line 1 of scripts/combine_contig_stats.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd
import os, gc
from utils.parsers_bbmap import read_coverage_binned, combine_coverages


# Lookup contig -> genome: the first file column becomes the index, the
# second one the values.
contig2genome = pd.read_csv(
    snakemake.input.contig2genome, header=None, index_col=0, sep="\t"
).iloc[:, 0]


# sum counts
logging.info("Loading counts and coverage per contig")

combined_cov, Counts_contigs = combine_coverages(
    snakemake.input.coverage_files, snakemake.params.samples
)

# transpose so that contigs are on the index and can be looked up in
# contig2genome
combined_cov = combined_cov.T

combined_cov.insert(
    0, "Genome", value=pd.Categorical(contig2genome.loc[combined_cov.index].values)
)

logging.info(f"Saving coverage to {snakemake.output.coverage_contigs}")

combined_cov.reset_index().to_parquet(snakemake.output.coverage_contigs)

logging.info("Sum counts per genome")

# Group the contigs (rows after transposing) by genome. This is the
# equivalent of the former `groupby(contig2genome, axis=1).sum().T`, without
# the `axis=1` argument that pandas >= 2.1 deprecates.
Counts_genome = Counts_contigs.T.groupby(contig2genome).sum()
Counts_genome.index.name = "Sample"

logging.info(f"Saving counts to {snakemake.output.counts}")

Counts_genome.reset_index().to_parquet(snakemake.output.counts)
# free the per-contig tables before loading the binned coverage
del Counts_genome, combined_cov, Counts_contigs
gc.collect()

# Binned coverage
logging.info("Loading binned coverage")
binCov = {}
for i, cov_file in enumerate(snakemake.input.binned_coverage_files):
    sample = snakemake.params.samples[i]

    binCov[sample] = read_coverage_binned(cov_file)

binCov = pd.DataFrame.from_dict(binCov)

logging.info("Add genome information to it")
binCov.insert(
    0,
    "Genome",
    value=pd.Categorical(contig2genome.loc[binCov.index.get_level_values(0)].values),
)

gc.collect()
logging.info(f"Saving combined binCov to {snakemake.output.binned_cov}")
binCov.reset_index().to_parquet(snakemake.output.binned_cov)

# Median coverage
logging.info("Calculate median coverage")
# Every category of "Genome" was built from the values themselves, so
# observed=True yields the same groups and avoids the pandas FutureWarning
# about the changing default.
Median_abund = binCov.groupby("Genome", observed=True).median().T
del binCov
gc.collect()
logging.info(f"Saving mediuan coverage {snakemake.output.median_abund}")
Median_abund.reset_index().to_parquet(snakemake.output.median_abund)

Python Pandas utils From line 1 of scripts/combine_coverage_MAGs.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


from pathlib import Path
import numpy as np
import pandas as pd
from collections import defaultdict

# Annotation columns that belong to each database; one parquet file is
# written per database.
db_columns = {
    "kegg": ["ko_id", "kegg_hit"],
    "peptidase": [
        "peptidase_id",
        "peptidase_family",
        "peptidase_hit",
        "peptidase_RBH",
        "peptidase_identity",
        "peptidase_bitScore",
        "peptidase_eVal",
    ],
    "pfam": ["pfam_hits"],
    "cazy": ["cazy_ids", "cazy_hits", "cazy_subfam_ec", "cazy_best_hit"],
    # "heme": ["heme_regulatory_motif_count"],
}

# database name -> list of per-file tables
Tables = defaultdict(list)

for file in snakemake.input:
    df = pd.read_csv(file, index_col=0, sep="\t")

    # drop un-annotated genes
    df = df.query("rank!='E'")

    # change index from 'subset1_Gene111' ->  simply 'Gene111'
    # Gene name to nr
    df.index = (
        df.index.str.split("_", n=1, expand=True)
        .get_level_values(1)
        .str[len("Gene") :]
        .astype(np.int64)
    )
    df.index.name = "GeneNr"

    # select columns, drop na rows and append to list
    for db, cols in db_columns.items():
        if not df.columns.intersection(cols).empty:
            # reindex rather than df[cols]: when a file carries only some of
            # the database's columns, the missing ones are filled with NA
            # instead of raising KeyError, and every table keeps one schema
            Tables[db].append(df.reindex(columns=cols).dropna(axis=0, how="all"))

    del df

out_dir = Path(snakemake.output[0])
# tolerate a pre-existing (e.g. left-over) output directory
out_dir.mkdir(parents=True, exist_ok=True)

for db, tables in Tables.items():
    combined = pd.concat(tables, axis=0)

    combined.sort_index(inplace=True)

    combined.reset_index().to_parquet(out_dir / (db + ".parquet"))

Python Pandas numpy From line 1 of scripts/combine_dram_gene_annotations.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of script
import numpy as np
import pandas as pd
import gc, os


import h5py

import h5py

import psutil


def measure_memory(write_log_entry=True):
    """Return the resident set size of this process in MB.

    When ``write_log_entry`` is true the value is also written to the log.
    """
    rss_bytes = psutil.Process().memory_info().rss
    mem_mb = rss_bytes / (1024 * 1024)

    if write_log_entry:
        logging.info(f"The process is currently using {mem_mb: 7.0f} MB of RAM")

    return mem_mb


logging.info("Start")
measure_memory()

N_samples = len(snakemake.input.covstats)

logging.info("Read gene info")

gene_info = pd.read_table(snakemake.input.info)

# Gene name is only first part of first column
gene_info.index = gene_info["#Name"].str.split(" ", n=1, expand=True)[0]
gene_info.index.name = "GeneName"
gene_info.drop("#Name", axis=1, inplace=True)

# Sort once by gene name; the per-sample tables are asserted below to be
# sorted by their index as well.
gene_info.sort_index(inplace=True)
N_genes = gene_info.shape[0]

# per-gene accumulators, updated sample by sample
gene_info[
    ["Samples_nz_coverage", "Samples_nz_counts", "Sum_coverage", "Max_coverage"]
] = 0


logging.info("Open hdf files for writing")

# one row per sample, one column per gene
gene_matrix_shape = (N_samples, N_genes)

with h5py.File(snakemake.output.cov, "w") as hdf_cov_file, h5py.File(
    snakemake.output.counts, "w"
) as hdf_counts_file:
    combined_cov = hdf_cov_file.create_dataset(
        "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
    )
    combined_counts = hdf_counts_file.create_dataset(
        "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip"
    )

    # add sample names attribute (stored as byte strings)
    sample_names = np.array(list(snakemake.params.samples)).astype("S")
    combined_cov.attrs["sample_names"] = sample_names
    combined_counts.attrs["sample_names"] = sample_names

    gc.collect()

    # sample -> summary statistics
    Summary = {}

    logging.info("Start reading files")
    measure_memory()

    for i, sample in enumerate(snakemake.params.samples):
        logging.info(f"Read coverage file for sample {i+1} / {N_samples}")
        sample_cov_file = snakemake.input.covstats[i]

        data = pd.read_parquet(
            sample_cov_file, columns=["GeneName", "Reads", "Median_fold"]
        ).set_index("GeneName")

        assert (
            data.shape[0] == N_genes
        ), f"I only have {data.shape[0]} /{N_genes} in the file {sample_cov_file}"

        # the values are written positionally into the matrices below, so the
        # rows must come in sorted order
        assert (
            data.index.is_monotonic_increasing
        ), f"data is not sorted by index in {sample_cov_file}"

        # downcast data
        # median is int
        Median_fold = pd.to_numeric(data.Median_fold, downcast="integer")
        Reads = pd.to_numeric(data.Reads, downcast="integer")

        # delete intermediate data and release mem
        del data

        # get summary statistics per sample
        logging.debug("Extract Summary statistics")

        Summary[sample] = {
            "Sum_coverage": Median_fold.sum(),
            "Total_counts": Reads.sum(),
            "Genes_nz_counts": (Reads > 0).sum(),
            "Genes_nz_coverage": (Median_fold > 0).sum(),
        }

        # get gene wise stats
        gene_info["Samples_nz_counts"] += (Reads > 0) * 1
        gene_info["Samples_nz_coverage"] += (Median_fold > 0) * 1
        gene_info["Sum_coverage"] += Median_fold

        gene_info["Max_coverage"] = np.fmax(gene_info["Max_coverage"], Median_fold)

        # row i of each matrix holds the values of sample i
        combined_cov[i, :] = Median_fold.values
        combined_counts[i, :] = Reads.values

        del Median_fold, Reads
        gc.collect()

        # log the memory footprint after each sample
        measure_memory()


logging.info("All samples processed")
gc.collect()

logging.info("Save sample Summary")
pd.DataFrame(Summary).T.to_csv(snakemake.output.sample_info, sep="\t")


logging.info("Save gene Summary")

# downcast: GC is kept as float, all other columns are downcast as integers
for col in gene_info.columns:
    if col == "GC":
        gene_info[col] = pd.to_numeric(gene_info[col], downcast="float")
    else:
        gene_info[col] = pd.to_numeric(gene_info[col], downcast="integer")

gene_info.reset_index().to_parquet(snakemake.output.gene_info)

Python Pandas numpy h5py psutil From line 2 of scripts/combine_gene_coverages.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of scripts

import pandas as pd
import numpy as np
from utils.taxonomy import tax2table

from glob import glob

# Collect the GTDB-Tk summary tables found in the classification folder,
# concatenate them and derive a taxonomy table from the classification column.
gtdb_classify_folder = snakemake.input.folder

taxonomy_files = glob(f"{gtdb_classify_folder}/gtdbtk.*.summary.tsv")
N_taxonomy_files = len(taxonomy_files)
logging.info(f"Found {N_taxonomy_files} gtdb taxonomy files.")

# only one or two summary files are accepted
if not (1 <= N_taxonomy_files <= 2):
    raise Exception(
        f"Found {N_taxonomy_files} number of taxonomy files 'gtdbtk.*.summary.tsv' in {gtdb_classify_folder} expect 1 or 2."
    )

summary_tables = []
for summary_file in taxonomy_files:
    summary_tables.append(pd.read_table(summary_file, index_col=0))

DT = pd.concat(summary_tables, axis=0)
DT.to_csv(snakemake.output.combined)

Tax = tax2table(DT.classification, remove_prefix=True)
Tax.to_csv(snakemake.output.taxonomy, sep="\t")

Python Pandas numpy utils From line 2 of scripts/combine_taxonomy.py

import os
import sys
import re


def main(jgi_file):
    """Print the VAMB-relevant columns of a jgi depth table to stdout.

    The columns ``contigName``, ``contigLen``, ``totalAvgDepth`` and every
    column whose header ends with ``.bam`` are kept, in that order, and
    written tab-separated. Lines whose first field is empty are skipped.

    Args:
        jgi_file: path to the tab-separated table; its first line is the header.

    Raises:
        KeyError: if one of the three mandatory columns is missing from the header.
    """
    col2keep = ["contigName", "contigLen", "totalAvgDepth"]
    # positions of the kept columns, resolved once from the header line
    keep_idx = []
    with open(jgi_file) as inF:
        for i, line in enumerate(inF):
            fields = line.rstrip().split("\t")
            if i == 0:
                header = {x: ii for ii, x in enumerate(fields)}
                col2keep += [x for x in fields if x.endswith(".bam")]
                keep_idx = [header[col] for col in col2keep]
                print("\t".join(col2keep))
                continue
            if fields[0] == "":
                continue
            print("\t".join(fields[idx] for idx in keep_idx))


if __name__ == "__main__":
    if "snakemake" in globals():
        import contextlib
        import traceback

        # Run under snakemake: stdout goes to the output file, stderr to the log.
        # The redirect_* context managers restore the real streams on exit, so
        # sys.stdout / sys.stderr never end up pointing at a closed file.
        with open(snakemake.log[0], "w") as log, open(
            snakemake.output[0], "w"
        ) as outf:
            with contextlib.redirect_stderr(log), contextlib.redirect_stdout(outf):
                try:
                    main(snakemake.input[0])
                except Exception:
                    # record the traceback in the log before it propagates
                    traceback.print_exc()
                    raise

    else:
        import argparse
        import logging

        logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)

        class CustomFormatter(
            argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
        ):
            """Help formatter that shows defaults and keeps the epilog layout."""

            pass

        desc = (
            "Converting jgi_summarize_bam_contig_depths output to format used by VAMB"
        )
        epi = """DESCRIPTION:
        Output format: contigName<tab>contigLen<tab>totalAvgDepth<tab>SAMPLE1.sort.bam<tab>Sample2.sort.bam<tab>...
        Output written to STDOUT
        """
        parser = argparse.ArgumentParser(
            description=desc, epilog=epi, formatter_class=CustomFormatter
        )
        parser.add_argument(
            "jgi_file",
            metavar="jgi_file",
            type=str,
            help="jgi_summarize_bam_contig_depths output table",
        )
        parser.add_argument("--version", action="version", version="0.0.1")

        args = parser.parse_args()
        main(args.jgi_file)

Python Snakemake From line 2 of scripts/convert_jgi2vamb_coverage.py

import sys, os
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd

# Compute the module coverage of every genome from the DRAM annotations and
# write it as a tab-separated table.
annotation_file = snakemake.input.annotations
module_output_table = snakemake.output[0]

from mag_annotator.database_handler import DatabaseHandler
from mag_annotator.summarize_genomes import build_module_net, make_module_coverage_frame

annotations = pd.read_csv(annotation_file, sep="\t", index_col=0)


# get db_locs and read in dbs
database_handler = DatabaseHandler(logger=logging, config_loc=snakemake.input.config)

# the module step form is mandatory for the summary
if "module_step_form" not in database_handler.config["dram_sheets"]:
    raise ValueError(
        "Module step form location must be set in order to summarize genomes"
    )

module_step_form_path = database_handler.config["dram_sheets"]["module_step_form"]
module_steps_form = pd.read_csv(module_step_form_path, sep="\t")

# one network per module
all_module_nets = {}
for module, module_df in module_steps_form.groupby("module"):
    all_module_nets[module] = build_module_net(module_df)

module_coverage_frame = make_module_coverage_frame(
    annotations, all_module_nets, groupby_column="fasta"
)
module_coverage_frame.to_csv(module_output_table, sep="\t")

Python Pandas From line 4 of scripts/DRAM_get_all_modules.py

import sys, os
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


import pyfastx


# Split the predicted genes into a "kept" set (nucleotide and protein fasta)
# and a "short" set (protein fasta only).
# NOTE(review): the loop pairs records purely by position, so it assumes the
# faa and fna files list the genes in the same order — confirm with the
# producer of these files.
faa_iterator = pyfastx.Fastx(snakemake.input.faa, format="fasta")
fna_iterator = pyfastx.Fastx(snakemake.input.fna, format="fasta")


with open(snakemake.output.faa, "w") as out_faa, open(
    snakemake.output.fna, "w"
) as out_fna, open(snakemake.output.short, "w") as out_short:
    for name, seq, comment in fna_iterator:
        # advance the protein iterator in lock-step with the gene iterator
        protein = next(faa_iterator)

        # include gene and corresponding protein if gene passes length threshold
        # or annotation contains prodigal info that it's complete
        if (len(seq) >= snakemake.params.minlength_nt) or ("partial=00" in comment):
            out_fna.write(f">{name} {comment}\n{seq}\n")
            # protein is unpacked as (name, sequence, comment): header first,
            # then the sequence
            out_faa.write(">{0} {2}\n{1}\n".format(*protein))

        else:
            # genes failing both tests: only the protein is written
            out_short.write(">{0} {2}\n{1}\n".format(*protein))

Python prodigal pyfastx From line 4 of scripts/filter_genes.py

import sys, os
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


import pandas as pd
from glob import glob
from numpy import log

from utils.parsers import load_quality


# Filter the bins by the configured quality criteria (and optionally by GUNC),
# then write the quality table and the fasta paths of the retained genomes.
Q = load_quality(snakemake.input.quality)

stats = pd.read_csv(snakemake.input.stats, index_col=0, sep="\t")
stats["logN50"] = log(stats.N50)

# merge table but only shared Bins and non overlapping columns
Q = Q.join(stats.loc[Q.index, stats.columns.difference(Q.columns)])
del stats

n_all_bins = Q.shape[0]

filter_criteria = snakemake.params["filter_criteria"]
logging.info(f"Filter genomes according to criteria:\n {filter_criteria}")


Q = Q.query(filter_criteria)

logging.info(f"Retain {Q.shape[0]} genomes from {n_all_bins}")


## GUNC

if hasattr(snakemake.input, "gunc"):
    gunc = pd.read_table(snakemake.input.gunc, index_col=0)
    gunc = gunc.loc[Q.index]

    # explicit comparison with False: only entries that are literally False
    # count as failed (missing values do not)
    bad_genomes = gunc.index[gunc["pass.GUNC"] == False]
    logging.info(f"{len(bad_genomes)} Don't pass gunc filtering")

    # Q is the result of a query(); rebind instead of dropping in place
    Q = Q.drop(bad_genomes)
else:
    logging.info(" Don't filter based on gunc")


if Q.shape[0] == 0:
    logging.error(
        f"No bins passed filtering criteria! Bad luck!. You might want to tweek the filtering criteria. Also check the {snakemake.input.quality}"
    )
    # sys.exit rather than the interactive-only `exit` helper from `site`
    sys.exit(1)

# output Q together with quality
Q.to_csv(snakemake.output.info, sep="\t")


# filter path genomes for skani

# Keep the table a DataFrame: squeezing a single-column table into a Series
# would make the positional column selection below fail.
F = pd.read_table(snakemake.input.paths, index_col=0)

F = F.loc[Q.index].iloc[:, 0]
F.to_csv(snakemake.output.paths, index=False, header=False)

Python Pandas numpy utils gunc From line 4 of scripts/filter_genomes.py

import os, sys
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception

#### Beginning of script

import pandas as pd
from utils import gene_scripts

# if MAGs are renamed I need to obtain the old contig names
# otherwise not
# Build the contig -> genome table.
# if MAGs are renamed I need to obtain the old contig names
# otherwise not
if snakemake.params.renamed_contigs:
    contigs2bins = pd.read_csv(
        snakemake.input.contigs2bins, index_col=0, sep="\t", header=None
    )

    contigs2bins.columns = ["Bin"]
    old2newID = pd.read_csv(snakemake.input.old2newID, index_col=0, sep="\t").squeeze()

    # NOTE(review): the genome column keeps whatever name the old2newID header
    # gives it; the groupby below expects it to be "MAG" — confirm.
    contigs2genome = contigs2bins.join(old2newID, on="Bin").dropna().drop("Bin", axis=1)
else:
    # `squeeze=False` was dropped from this call: it was the default and the
    # keyword no longer exists in pandas >= 2.0 (TypeError).
    contigs2genome = pd.read_csv(
        snakemake.input.contigs2mags, index_col=0, sep="\t", header=None
    )
    contigs2genome.columns = ["MAG"]

# load orf_info
orf_info = pd.read_parquet(snakemake.input.orf_info)


# recreate Contig name `Sample_ContigNr` and Gene names `Gene0004`
orf_info["Contig"] = orf_info.Sample + "_" + orf_info.ContigNr.astype(str)
orf_info["Gene"] = gene_scripts.geneNr_to_string(orf_info.GeneNr)

# Join genomes on contig
orf_info = orf_info.join(contigs2genome, on="Contig")

# remove genes not on genomes
orf_info = orf_info.dropna(axis=0)


# count genes per genome in a matrix (rows: genes, columns: genomes)
gene2genome = pd.to_numeric(
    orf_info.groupby(["Gene", "MAG"]).size(), downcast="unsigned"
).unstack(fill_value=0)

# save as parquet
gene2genome.reset_index().to_parquet(snakemake.output[0])

Python Pandas utils From line 2 of scripts/gene2genome.py

import sys, os
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


## Start

import pandas as pd
import numpy as np

from utils import gene_scripts

# CLuterID    GeneID    empty third column
# Cluster table: representative id, ORF id (a third, empty column is ignored)
orf2gene = pd.read_csv(
    snakemake.input.cluster_attribution, header=None, sep="\t", usecols=[0, 1]
)
orf2gene.columns = ["Representative", "ORF"]

# split orf names in sample, contig_nr, and orf_nr
orf_info = gene_scripts.split_orf_to_index(orf2gene.ORF)

# number the representatives 1..N in order of first appearance
representative_names = orf2gene.Representative.unique()
n_representatives = len(representative_names)

map_names = pd.Series(
    np.arange(1, n_representatives + 1, dtype=np.uint),
    index=representative_names,
)
map_names.index.name = "Representative"
map_names.name = "GeneNr"

# replace each representative name by its number
orf_info["GeneNr"] = orf2gene.Representative.map(map_names)
orf_info.to_parquet(snakemake.output.cluster_attribution)

# Save name of representatives
map_names.to_csv(snakemake.output.rep2genenr, sep="\t")

Python Pandas numpy utils From line 4 of scripts/generate_orf_info.py

import sys, os
import logging, traceback

logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.DEBUG,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


# start of script
import argparse
import os, sys
import shutil
import warnings

import pandas as pd
from Bio import SeqIO


def get_fasta_of_bins(cluster_attribution, contigs_file, out_folder):
    """
    Creates individual fasta files for each bin using the contigs fasta and the cluster attribution.

    input:
    - cluster_attribution:   tab separated file of "contig_fasta_header    bin"
    - contigs_file:          fasta file of contigs
    - out_folder:            folder for the bin fastas, written as {out_folder}/{binid}.fasta
                             (an existing folder is removed first)

    raises:
    - AssertionError if the table does not have two columns, the fasta is
      empty, or no bins are found
    - KeyError if a contig listed for a bin is not in the fasta
    """
    # create outdir
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    os.makedirs(out_folder)

    CA = pd.read_csv(cluster_attribution, header=None, sep="\t", dtype=str)

    assert CA.shape[1] == 2, f"File should have only two columns {cluster_attribution}"

    CA.columns = ["Contig", "Bin"]

    # contig names are deliberately not required to be unique here

    logging.info(f"index fasta file {contigs_file} for fast access")
    contig_fasta_dict = SeqIO.index(str(contigs_file), "fasta")

    # the index keeps the fasta file open; make sure it is closed again
    try:
        assert len(contig_fasta_dict) > 0, "No contigs in your fasta"

        unique_bins = CA.Bin.unique()

        assert len(unique_bins) >= 1, "No bins found"

        for binid in unique_bins:
            bin_contig_names = CA.loc[CA.Bin == binid, "Contig"].tolist()
            out_file = os.path.join(out_folder, f"{binid}.fasta")

            assert (
                len(bin_contig_names) >= 1
            ), f"No contigs found for bin {binid} in {cluster_attribution}"

            if len(bin_contig_names) == 1:
                warnings.warn(f"single contig bin Bin : {binid} {bin_contig_names}")

            logging.debug(f"Found {len(bin_contig_names)} contigs {bin_contig_names}")

            fasta_contigs = [contig_fasta_dict[c] for c in bin_contig_names]
            SeqIO.write(fasta_contigs, out_file, "fasta")
    finally:
        contig_fasta_dict.close()


if __name__ == "__main__":
    if "snakemake" not in globals():
        # Command-line use. The parsed option names are passed on as keyword
        # arguments, so each dest must equal a parameter name of
        # get_fasta_of_bins (hence dest="contigs_file" for --contigs).
        # NOTE(review): the logging setup at the top of this script reads
        # snakemake.log unconditionally, so the command-line path needs that
        # guarded as well before it can run.
        p = argparse.ArgumentParser()
        p.add_argument("--cluster-attribution", required=True)
        p.add_argument("--contigs", dest="contigs_file", required=True)
        p.add_argument("--out-folder", required=True)
        args = vars(p.parse_args())
        get_fasta_of_bins(**args)
    else:
        get_fasta_of_bins(
            snakemake.input.cluster_attribution,
            snakemake.input.contigs,
            snakemake.output[0],
        )

Python Snakemake Pandas Biopython From line 4 of scripts/get_fasta_of_bins.py

import os, sys
import logging, traceback


logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Excepthook that writes uncaught exceptions to the log.

    KeyboardInterrupt is passed to the interpreter's default hook instead
    of being logged.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        formatted = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(formatted))
    else:
        sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Install exception handler
sys.excepthook = handle_exception


# beginning of script

import datetime
import shutil
import os


timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%X")


def get_read_stats(fraction, params_in):
    """Run ``reformat.sh`` on one read fraction and return its read/base counts.

    The histogram files (base composition, quality, read length, GC) are
    written to ``<snakemake.params.folder>/<fraction>``. The stderr of
    ``reformat.sh`` is appended to the rule log and also captured in a
    temporary file, from which the ``Input:`` summary line is parsed.

    Args:
        fraction: Name of the sub-folder for this fraction ("pe", "se" or "").
        params_in: Input arguments handed to ``reformat.sh`` verbatim,
            e.g. ``"in1=R1.fq in2=R2.fq"`` or ``"in=reads.fq"``.

    Returns:
        Tuple ``(n_reads, n_bases)`` of integers taken from the ``Input:`` line.

    Raises:
        Exception: If the captured output contains no ``Input:`` line. The
            temporary file is left in place in that case.
    """
    from snakemake.shell import shell

    subfolder = os.path.join(snakemake.params.folder, fraction)
    tmp_file = os.path.join(subfolder, "read_stats.tmp")
    shell(
        f" mkdir -p {subfolder} 2>> {snakemake.log[0]} "
        " ; "
        f" reformat.sh {params_in} "
        f" bhist={subfolder}/base_hist.txt "
        f" qhist={subfolder}/quality_by_pos.txt "
        f" lhist={subfolder}/readlength.txt "
        f" gchist={subfolder}/gc_hist.txt "
        " gcbins=auto "
        f" bqhist={subfolder}/boxplot_quality.txt "
        f" threads={snakemake.threads} "
        " overwrite=true "
        f" -Xmx{snakemake.resources.java_mem}G "
        f" 2> >(tee -a {snakemake.log[0]} {tmp_file} ) "
    )

    # Read the captured stderr and close the handle right away.
    with open(tmp_file) as stats_handle:
        content = stats_handle.read()

    pos = content.find("Input:")
    if pos == -1:
        raise Exception("Didn't find read number in file:\n\n" + content)

    # Expected layout of the summary line:
    # Input:    123 reads   1234 bases
    n_reads, _, n_bases = content[pos:].split()[1:4]

    os.remove(tmp_file)
    return int(n_reads), int(n_bases)


# Two or more input files are treated as a paired-end sample (R1, R2), with an
# optional file of single-end reads given by params.single_end_file.
paired_end = len(snakemake.input) >= 2

if not paired_end:
    headers = [
        "Sample",
        "Step",
        "Total_Reads",
        "Total_Bases",
        "Reads",
        "Bases",
        "Timestamp",
    ]
    # The (reads, bases) tuple is repeated: totals equal the only fraction.
    values = 2 * get_read_stats("", "in=" + snakemake.input[0])
else:
    headers = [
        "Sample",
        "Step",
        "Total_Reads",
        "Total_Bases",
        "Reads_pe",
        "Bases_pe",
        "Reads_se",
        "Bases_se",
        "Timestamp",
    ]

    n_reads_pe, n_bases_pe = get_read_stats(
        "pe", "in1={0} in2={1}".format(*snakemake.input)
    )
    # Presumably reformat.sh counts both mates, so halving gives the number
    # of pairs - TODO confirm.
    # NOTE(review): true division makes this a float, so the read counts of
    # paired-end samples are written as e.g. "100.0".
    n_reads_pe = n_reads_pe / 2

    if not os.path.exists(snakemake.params.single_end_file):
        n_reads_se, n_bases_se = 0, 0
    else:
        n_reads_se, n_bases_se = get_read_stats(
            "se", "in=" + snakemake.params.single_end_file
        )

    values = [
        n_reads_pe + n_reads_se,
        n_bases_pe + n_bases_se,
        n_reads_pe,
        n_bases_pe,
        n_reads_se,
        n_bases_se,
    ]

# Write a two-line table: header and the single row of this sample/step.
with open(snakemake.output.read_counts, "w") as f:
    f.write("\t".join(headers) + "\n")
    row = [
        snakemake.wildcards.sample,
        snakemake.wildcards.step,
        *map(str, values),
        timestamp,
    ]
    f.write("\t".join(row) + "\n")

# Pack the histogram folder into <folder>.zip and drop the unpacked copy.
shutil.make_archive(snakemake.params.folder, "zip", snakemake.params.folder)
shutil.rmtree(snakemake.params.folder)

Python Snakemake From line 2 of scripts/get_read_stats.py

import os, sys
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception

from utils.fasta import parse_fasta_headers
from utils.utils import gen_names_for_range
from glob import glob
import pandas as pd


# Collect the per-bin fasta files found in the input folder. glob() returns
# paths in arbitrary, filesystem-dependent order; sorting makes the mapping
# between bin file and generated bin name reproducible from run to run.
fasta_files = sorted(glob(f"{snakemake.input[0]}/*{snakemake.params.extension}"))

if fasta_files:
    # One generated name per bin file: <sample>_SemiBin_<number>.
    Bin_names = gen_names_for_range(
        N=len(fasta_files), prefix=f"{snakemake.wildcards.sample}_SemiBin_"
    )

    mappings = []

    for bin_name, fasta in zip(Bin_names, fasta_files):
        contigs = parse_fasta_headers(fasta)

        # Series indexed by contig name whose value is the bin name.
        mappings.append(pd.Series(data=bin_name, index=contigs))

    # Two-column table without header: contig <tab> bin.
    pd.concat(mappings, axis=0).to_csv(
        snakemake.output[0], sep="\t", header=False, index=True
    )

else:
    logging.warning(
        f"No bins found in {snakemake.input[0]} add longest contig as bin to make atlas continue."
    )

    # Fallback: a single placeholder bin holding contig "<sample>_1"
    # (per the warning above this is meant to be the longest contig).
    with open(snakemake.output[0], "w") as outf:
        outf.write("{sample}_1\t{sample}_SemiBin_1\n".format(**snakemake.wildcards))

Python Pandas utils From line 1 of scripts/parse_semibin.py

import os, sys
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception


import pandas as pd
from pathlib import Path
from utils.utils import gen_names_for_range
from utils.fasta import parse_fasta_headers


fasta_extension = snakemake.params.fasta_extension
separator = snakemake.params.separator

# Path template containing "{sample}", filled in once per sample.
cluster_output_path = snakemake.params.output_path

# Combined cluster table for all samples of all bingroups.
renamed_clusters_file = snakemake.output.renamed_clusters

all_clusters = []

# One vamb output folder per bingroup; inputs and bingroups are index-aligned.
for i, vamb_input in enumerate(snakemake.input):
    vamb_folder = Path(vamb_input)

    bingroup = snakemake.params.bingroups[i]

    logging.info(f"Parse vamb output for bingroup {bingroup}")

    bin_dir = vamb_folder / "bins"
    vamb_cluster_file = vamb_folder / "clusters.tsv"

    # Only bins that were written as fasta files count as "big enough";
    # not every bin listed in the cluster file passes vamb's size filter.
    big_bins = []

    for entry in os.listdir(bin_dir):
        stem, suffix = os.path.splitext(entry)

        logging.debug(f"Found file {stem} with extension {suffix}")

        if suffix != fasta_extension:
            continue
        big_bins.append(stem)

    logging.info(
        f"Found {len(big_bins)} bins created by Vamb (above size limit)\n"
        f"E.g. {big_bins[:5]}"
    )

    logging.info(f"Load vamb cluster file {vamb_cluster_file}")
    raw_clusters = pd.read_table(vamb_cluster_file, header=None)
    raw_clusters.columns = ["OriginalName", "Contig"]

    # Split "<sample><separator><contig>" at the last separator.
    # This is mainly done for compatibility with SemiBin.
    clusters = raw_clusters["Contig"].str.rsplit(separator, n=1, expand=True)
    clusters.columns = ["Sample", "Contig"]

    # Bin number given by vamb (text after the last separator), prefixed
    # with the bingroup.
    bin_numbers = raw_clusters["OriginalName"].str.rsplit(
        separator, n=1, expand=True
    )[1]
    clusters["BinId"] = bingroup + bin_numbers

    # Flag the bins that passed the size filter.
    clusters["OriginalName"] = raw_clusters["OriginalName"]
    clusters["Large_enough"] = clusters["OriginalName"].isin(big_bins)

    clusters["BinGroup"] = bingroup

    all_clusters.append(clusters)

    del raw_clusters


logging.info("Concatenate clusters of all bingroups")
clusters = pd.concat(all_clusters, axis=0)


passing = clusters.query("Large_enough")
n_bins = passing.groupby(["BinGroup", "Sample"])["BinId"].nunique()
logging.info(
    f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}"
)


# Final bin name per contig; left empty for bins below the size filter.
clusters["SampleBin"] = clusters["Sample"] + "_vamb_" + clusters["BinId"]
clusters.loc[~clusters["Large_enough"], "SampleBin"] = ""


logging.info(f"Write reformated table to {renamed_clusters_file}")
clusters.to_csv(renamed_clusters_file, sep="\t", index=False)

# Everything below only concerns bins that passed the size filter.
clusters = clusters.query("Large_enough")

logging.info("Write cluster_attribution for samples")
for sample, sample_clusters in clusters.groupby("Sample"):
    sample_output_path = cluster_output_path.format(sample=sample)

    logging.debug(f"Write file {sample_output_path}")
    sample_clusters[["Contig", "SampleBin"]].to_csv(
        sample_output_path, sep="\t", index=False, header=False
    )


# Samples without any passing bin still need an attribution file.
samples_without_bins = set(snakemake.params.samples) - set(clusters["Sample"])

if samples_without_bins:
    logging.warning(
        "The following samples did't yield bins, I add longest contig to make the pipline continue:\n"
        + "\n".join(samples_without_bins)
    )

    # Placeholder bin holding contig "<sample>_1" (per the warning above this
    # is meant to be the longest contig).
    for sample in samples_without_bins:
        with open(cluster_output_path.format(sample=sample), "w") as fout:
            fout.write(f"{sample}_1\t{sample}_vamb_1\n")

Python Pandas utils SemiBin VAMB From line 1 of scripts/parse_vamb.py

import sys, os
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception


from Bio import SeqIO

# Copy the contigs of the input assembly to the output fasta under the new
# names "<sample>_<running number>" and record "new name <tab> old name" for
# each of them in the mapping table.
with open(snakemake.output.fasta, "w") as fasta_out, open(
    snakemake.output.mapping_table, "w"
) as table_out:
    records = SeqIO.parse(snakemake.input[0], "fasta")

    for number, record in enumerate(records, start=1):
        # Stop at the first contig below the minimum length.
        # NOTE(review): `break` (not `continue`) drops every later record as
        # well - presumably the input is sorted by decreasing length; confirm.
        if len(record) < snakemake.params.minlength:
            break

        previous_id = record.id
        record.id = f"{snakemake.wildcards.sample}_{number}"
        # Clear the description so the header holds only the new name.
        record.description = ""

        SeqIO.write(record, fasta_out, "fasta")
        table_out.write(f"{record.id}\t{previous_id}\n")

Python Biopython From line 4 of scripts/rename_assembly.py

import sys, os
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception


import pandas as pd
from utils.gene_scripts import geneNr_to_string


# Start


# Table indexed by the representative gene's original name; squeeze() turns
# a single-column table into a Series.
map_genenr = pd.read_csv(snakemake.input.rep2genenr, sep="\t", index_col=0).squeeze()

# Translate gene numbers into gene names.
rep2gene = geneNr_to_string(map_genenr)

logging.info(
    f"Collect and rename representative genes according to:\n {rep2gene.head()}"
)

assert rep2gene.shape[0] > 0

# Stream the fasta: rewrite header lines, copy sequence lines untouched.
with open(snakemake.output[0], "w") as fout, open(snakemake.input.fasta, "r") as fin:
    for line in fin:
        if line[0] != ">":
            fout.write(line)
            continue

        # The original name is the first space-delimited token of the header.
        original_name = line[1:].strip().split(" ")[0]
        new_id = rep2gene.loc[original_name]

        # New id first; the original name is kept as the description.
        fout.write(f">{new_id} {original_name}\n")

Python Pandas utils From line 4 of scripts/rename_genecatalog.py

import sys, os
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception


# start


from atlas import utils
import pandas as pd

# Columns of the paths table: Bin, Filename, Proteins - only the fasta
# file name per bin is needed here.
paths = pd.read_csv(snakemake.input.paths, sep="\t", index_col=0).Filename

# Columns of the mapping table: genome, SpeciesNr, Species, Representative.
mapping = pd.read_csv(snakemake.input.mapping_file, sep="\t", index_col=0).squeeze()


# Give every representative genome a standardized name with prefix "MAG",
# in order of first appearance in the mapping table.
representatives = mapping.Representative.unique()
mag_names = utils.gen_names_for_range(len(representatives), prefix="MAG")
old2new_name = dict(zip(representatives, mag_names))
mapping["MAG"] = mapping.Representative.map(old2new_name)


# Cluster attribution: every bin with its MAG and representative.
mapping[["MAG", "Representative"]].to_csv(
    snakemake.output.mapfile_allbins2mag, sep="\t", header=True
)

# Old-to-new names, restricted to the representatives themselves.
old2new = mapping.loc[representatives, "MAG"]
old2new.index.name = "Representative"
old2new.to_csv(snakemake.output.mapfile_old2mag, sep="\t", header=True)


#### Write genomes and contig to genome mapping file
output_dir = snakemake.output.dir
mapfile_contigs = snakemake.output.mapfile_contigs
rename_contigs = snakemake.params.rename_contigs


os.makedirs(output_dir)

with open(mapfile_contigs, "w") as out_contigs:
    for rep in representatives:
        fasta_in = paths.loc[rep]
        new_name = old2new.loc[rep]
        fasta_out = os.path.join(output_dir, f"{new_name}.fasta")

        with open(fasta_in) as source, open(fasta_out, "w") as target:
            contig_count = 0
            for line in source:
                # Sequence lines are copied unchanged.
                if line[0] != ">":
                    target.write(line)
                    continue

                contig_count += 1

                # Either number the contigs per genome, or keep the first
                # whitespace-delimited token of the original header.
                if rename_contigs:
                    new_header = f"{new_name}_{contig_count}"
                else:
                    new_header = line[1:].strip().split()[0]

                # contig -> genome mapping, then the fasta header itself
                out_contigs.write(f"{new_header}\t{new_name}\n")
                target.write(f">{new_header}\n")


# rename quality
def rename_quality(quality_in, quality_out, old2new_name):
    """Write the rows of a quality table selected and renamed by a mapping.

    Args:
        quality_in: Tab-separated table whose first column is the index.
        quality_out: Path of the tab-separated table to write.
        old2new_name: Mapping from old index label to new label. Only rows
            whose label is a key are kept, in the order of the keys.
    """
    quality = pd.read_csv(quality_in, sep="\t", index_col=0)

    selected = quality.loc[list(old2new_name.keys())]
    renamed = selected.rename(index=old2new_name)

    renamed.to_csv(quality_out, sep="\t")


# Keep only the representatives in the genome quality table and re-label
# them with their new MAG names.
rename_quality(
    quality_in=snakemake.input.genome_info,
    quality_out=snakemake.output.genome_info,
    old2new_name=old2new_name,
)

Python Pandas Atlas From line 3 of scripts/rename_genomes.py

import sys, os
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception

# start
import ete3

# Load the input tree (newick format 1, node names may be quoted).
tree = ete3.Tree(snakemake.input.tree, quoted_node_names=True, format=1)

# Midpoint rooting is best effort: on any failure the problem is logged and
# the tree is written in whatever state it is in at that point.
try:
    tree.unroot()
    # Only re-root trees with more than two leaves.
    if len(tree) > 2:
        midpoint = tree.get_midpoint_outgroup()
        tree.set_outgroup(midpoint)

except Exception as error:
    logging.error("Failed to root tree, keep unrooted. Reason was:\n\n" + str(error))


tree.write(outfile=snakemake.output.tree)

Python ete3 From line 2 of scripts/root_tree.py

import sys, os
import logging, traceback

# Send all records of the root logger (INFO and above) to the first log file
# of the rule, each line prefixed with a "YYYY-MM-DD HH:MM:SS" timestamp.
# `snakemake` is not defined in this script; presumably it is injected by
# Snakemake when the script runs through a `script:` directive.
logging.basicConfig(
    filename=snakemake.log[0],
    level=logging.INFO,
    format="%(asctime)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def handle_exception(exc_type, exc_value, exc_traceback):
    """Log an uncaught exception together with its formatted traceback.

    Intended to be installed as ``sys.excepthook``. ``KeyboardInterrupt`` is
    not logged; it is handed to Python's built-in hook instead.
    """
    if not issubclass(exc_type, KeyboardInterrupt):
        trace_lines = traceback.format_exception(exc_type, exc_value, exc_traceback)
        logging.error("Uncaught exception: " + "".join(trace_lines))
        return

    # User interruption: fall back to the interpreter's default behaviour.
    sys.__excepthook__(exc_type, exc_value, exc_traceback)


# Route every uncaught exception of this script through the logger.
sys.excepthook = handle_exception

## start


from utils import fasta

# Hand the gene catalog to the project's fasta.split helper, together with
# the configured subset size and the first output path.
# NOTE(review): presumably this writes subsets of `subset_size` sequences
# with simplified headers into the output location - confirm in utils.fasta.
catalog_fasta = snakemake.input[0]
subset_size = snakemake.params.subset_size
split_output = snakemake.output[0]

fasta.split(catalog_fasta, subset_size, split_output, simplify_headers=True)

Python utils From line 4 of scripts/split_genecatalog.py

__author__ = "Christopher Schröder, Patrik Smeds"
__copyright__ = "Copyright 2020, Christopher Schröder, Patrik Smeds"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"

from os import path
from snakemake.shell import shell

# Shell redirection snippet sending stdout and stderr to the rule's log.
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Check inputs/arguments: exactly one reference genome is required.
n_inputs = len(snakemake.input)
if n_inputs == 0:
    raise ValueError("A reference genome has to be provided.")
if n_inputs > 1:
    raise ValueError("Please provide exactly one reference genome as input.")

# File endings accepted below as bwa-mem2 index outputs.
valid_suffixes = {".0123", ".amb", ".ann", ".bwt.2bit.64", ".pac"}


def get_valid_suffix(path):
    """Return the entry of ``valid_suffixes`` that *path* ends with.

    Returns ``None`` when *path* ends with none of the known suffixes.
    """
    matches = (suffix for suffix in valid_suffixes if path.endswith(suffix))
    return next(matches, None)


# Strip the index suffix from every declared output; all outputs have to
# agree on the remaining prefix, which is handed to `bwa-mem2 index -p`.
prefixes = set()
for output_file in snakemake.output:
    matched_suffix = get_valid_suffix(output_file)
    if matched_suffix is None:
        raise ValueError(
            f"{output_file} cannot be generated by bwa-mem2 index (invalid suffix)."
        )
    prefixes.add(output_file[: -len(matched_suffix)])

if len(prefixes) != 1:
    raise ValueError("Output files must share common prefix up to their file endings.")

# `prefix` and `log` are referenced by name in the shell template below.
prefix = next(iter(prefixes))

shell("bwa-mem2 index -p {prefix} {snakemake.input[0]} {log}")

Python Snakemake Bwa-mem2 From line 1 of index/wrapper.py

__author__ = "Christopher Schröder, Johannes Köster, Julian de Ruiter"
__copyright__ = (
    "Copyright 2020, Christopher Schröder, Johannes Köster and Julian de Ruiter"
)
__email__ = "[email protected] [email protected], [email protected]"
__license__ = "MIT"


import tempfile
from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Extract arguments.
# Extract arguments. The variable names matter: they are referenced by name
# inside the shell command templates below.
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sort", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")
samtools_opts = get_samtools_opts(snakemake)
java_opts = get_java_opts(snakemake)


# Derive the index prefix from the `idx` input, which may be a single file or
# a list of index files. The branch has to test `idx` itself: testing a
# separate (normally absent) `index` input always selects the single-file
# path and then fails with a TypeError when `idx` is a list.
# path.splitext only removes the last extension of the (first) index file.
index = snakemake.input.idx
if isinstance(index, str):
    index = path.splitext(index)[0]
else:
    index = path.splitext(index[0])[0]


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")

if sort_order not in {"coordinate", "queryname"}:
    raise ValueError(f"Unexpected value for sort_order ({sort_order})")

# Determine which pipe command to use for converting to bam or sorting.
# The commands are templates; their placeholders are filled in by shell().
if sort == "none":
    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view {samtools_opts}"

elif sort == "samtools":
    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {samtools_opts} {sort_extra} -T {tmpdir}"

    # Add name flag if needed. This still takes effect because the template
    # above is only formatted when shell() runs.
    if sort_order == "queryname":
        sort_extra += " -n"

elif sort == "picard":
    # Sort alignments using picard SortSam.
    pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}"

else:
    raise ValueError(f"Unexpected value for params.sort ({sort})")


# Align and pipe into the chosen converter/sorter; the temporary directory
# is removed again when the block exits.
with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(bwa-mem2 mem"
        " -t {snakemake.threads}"
        " {extra}"
        " {index}"
        " {snakemake.input.reads}"
        " | " + pipe_cmd + ") {log}"
    )

Python Snakemake SAMtools Picard snakemake-wrapper-utils From line 1 of mem/wrapper.py

__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import infer_out_format
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Extract arguments. The variable names matter: `extra`, `pipe_cmd` and `log`
# are referenced by name inside the shell command template below.
samtools_opts = get_samtools_opts(snakemake, parse_output=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_extra = snakemake.params.get("sort_extra", "")

out_ext = infer_out_format(snakemake.output[0])

# PAF output is written by minimap2 directly; every other format goes
# through SAM (-a) and, where needed, a samtools conversion or sort.
pipe_cmd = ""
if out_ext != "PAF":
    # Add option for SAM output
    extra += " -a"

    # Determine which pipe command to use for converting to bam or sorting.
    if sort == "none":

        if out_ext != "SAM":
            # Simply convert to output format using samtools view.
            pipe_cmd = f"| samtools view -h {samtools_opts}"

    elif sort in ["coordinate", "queryname"]:

        # Add name flag if needed.
        if sort == "queryname":
            sort_extra += " -n"

        # Sort alignments.
        pipe_cmd = f"| samtools sort {sort_extra} {samtools_opts}"

    else:
        # The parameter is read from params.sorting, so name it correctly.
        raise ValueError(f"Unexpected value for params.sorting: {sort}")


shell(
    "(minimap2"
    " -t {snakemake.threads}"
    " {extra} "
    " {snakemake.input.target}"
    " {snakemake.input.query}"
    " {pipe_cmd}"
    " > {snakemake.output[0]}"
    ") {log}"
)

Python Snakemake SAMtools snakemake-wrapper-utils From line 1 of aligner/wrapper.py

__author__ = "Tom Poorten"
__copyright__ = "Copyright 2017, Tom Poorten"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# `log` and `extra` are referenced by name inside the shell template.
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

# Build the minimap2 index (-d) of the target and write it to the first output.
shell(
    "(minimap2 -t {snakemake.threads} {extra} -d {snakemake.output[0]} {snakemake.input.target}) {log}"
)

Python Snakemake From line 1 of index/wrapper.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Optional BED input: passed to samtools stats as "-t <file>" when given.
bed = snakemake.input.get("bed", "")
bed = "-t " + bed if bed else bed

# Generic samtools options only; output handling is done by the shell
# redirection in the command below.
samtools_opts = get_samtools_opts(
    snakemake,
    parse_write_index=False,
    parse_output=False,
    parse_output_format=False,
)

# These names are referenced inside the shell template.
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
region = snakemake.params.get("region", "")
extra = snakemake.params.get("extra", "")


shell(
    "samtools stats {samtools_opts} {extra} {snakemake.input.bam} {bed} {region} > {snakemake.output} {log}"
)

Python Snakemake SAMtools snakemake-wrapper-utils From line 3 of stats/wrapper.py

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
# Set this to True if multiqc should use the actual input files directly
# instead of scanning the folders in which the provided files are located.
use_input_files_only = snakemake.params.get("use_input_files_only", False)

if use_input_files_only:
    input_data = set(snakemake.input)
else:
    # Default: hand over each parent directory once.
    input_data = {path.dirname(fp) for fp in snakemake.input}

# The report location is split into directory (-o) and file name (-n).
output_dir, output_name = (
    path.dirname(snakemake.output[0]),
    path.basename(snakemake.output[0]),
)
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc {extra} --force -o {output_dir} -n {output_name} {input_data} {log}"
)