A Snakemake workflow for fastCN reference setup


The Snakefile is under workflow/.

Installing

Most of the installation is handled by Snakemake, but there is one small Makefile. Please refer to it so you can set your LD_LIBRARY_PATH correctly before running the workflow.

Configuration

See config/config.yaml for an example layout, and .test/config.yaml for a minimal test case.
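A minimal sketch of what the configuration might contain. The key names below are illustrative assumptions only, not taken from the repository; config/config.yaml and .test/config.yaml are the authoritative examples.

```yaml
# hypothetical layout — defer to config/config.yaml in the repo
ref: /path/to/reference.fasta          # reference to build fastCN files for
samples:
  sample_1: /path/to/sample_1.reads.fofn
```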

Workflow layout


TODO

  • add raw read depth counts

Code Snippets

script:
    "../scripts/unpack_partial_sam.py"
shell:
    """
    SAM_GC_correction \
            {input.fai} {input.bin} {input.exp} \
            $(dirname {output.binary})/{wildcards.sm}
    """
shell:
    """
    pigz -p {threads} -c {input.binary} > {output.zipped}
    """
shell:
    """
    perbp-to-windows.py \
        --depth {input.binary} \
        --out {output.windows} \
        --chromlen {input.fai} \
        --windows {input.ref_windows}
    """
shell:
    """
    depth-to-cn.py --in {input.windows} --autocontrol {input.control_bed} --chrX {input.chrX_control_bed}
    """
shell:
    """
    if [[ {input.reads} =~ \.(fasta|fasta.gz|fa|fa.gz|fastq|fastq.gz|fq|fq.gz)$ ]]; then 
        cat {input.reads} \
            | seqtk seq -F '#' \
            | rustybam fastq-split {output.reads} 
    elif [[ {input.reads} =~ \.(bam|cram|sam|sam.gz)$ ]]; then 
        samtools fasta -@ {threads} {input.reads} \
            | seqtk seq -F '#' \
            | rustybam fastq-split {output.reads} 
    fi 
    """
shell:
    """
    mrsfast --index {input.ref}
    """
shell:
    """
    extract-from-fastq36.py --in {input.reads} \
        | mrsfast --search {input.ref} --seq /dev/stdin \
            --disable-nohits --mem {resources.total_mem} --threads {threads} \
            -e 2 --outcomp \
            -o $(dirname {output.sam})/{wildcards.scatteritem} \
        > {log} 2>&1
    """
shell:
    """
    zcat {input.sam} \
        | samtools view -b - \
        | samtools sort -@ {threads} \
         -T {resources.tmpdir} -m 2G \
         -o {output.bam} -
    """
shell:
    """
    samtools merge -@ {threads} - {input.bams} -u \
        | samtools view -h - \
        | awk 'BEGIN {{OFS="\\t"}} {{print $1,$2,$3,$4,$5}}' \
        | pigz -p {threads} \
    > {output.merged}
    """
script:
    "../scripts/compress_mrsfast.py"
run:
    if input.ref.endswith(".gz"):
        shell("gunzip -c {input.ref} > {output.ref_keep}")
    else:
        shell("ln -s $(readlink -f {input.ref}) {output.ref_keep}")
shell:
    """
    quicKmer2 search -k {params.kmer} -t 20 -s 3G -e 2 -d 100 -w {params.window_size} -c {input.include_bed} {input.ref}
    """
shell:
    """
    samtools faidx {input.ref}
    """
shell:
    """
    mkdir -p $(dirname {output.qm2})
    zcat {input.fastq} | seqtk seq -A -U -l 60 - | quicKmer2 count -t {threads} {input.ref} /dev/stdin $( echo {output.qm2_bin} | sed 's/.bin//' )
    """
shell:
    """
    quicKmer2 est {input.ref} $( echo {input.qm2_bin} | sed 's/.bin//' ) {output.bed}
    """


'''
rule quickmer_browser:
    input:
        qm2=rules.quickmer_est.output.bed
    output:
        browser_file = temp("temp/{sample}/windows/quickmer2/{sm}.depth.bed.CN.bed")
    conda:
        "../envs/env.yml"
    log:
        "logs/{sample}/quickmer/{sm}/bed.log"
    resources:
        mem=12,
        hrs=3
    threads: 1
    shell:
        """
        grep -v decoy {input.qm2} | grep -v chrEBV > {output.browser_file}
        """
'''
shell:
    """
    zcat -f -- {input.rm} {input.trf} {input.gaps} \
        | cut -f 1-3 \
        | bedtools sort -i - -g {input.fai} \
        | bedtools slop -i - -g {input.fai} -b 36 \
        | bedtools merge -i - \
    > {output.bed}
    """
shell:
    """
    cat \
        <(bedtools slop -b 36 -g {input.fai} -i {input.wm}) \
        <(bedtools slop -b 10000 -g {input.fai} -i {input.sd}) \
        | cut -f 1-3 \
        | bedtools sort -i - -g {input.fai} \
        | bedtools merge -i - \
    > {output.bed}
    """
shell:
    """
    bedtools complement \
        -i {input.exclude} \
        -g {input.fai} \
        > {output.include}
    """
shell:
    """
    GC_control_gen \
        {input.fasta} \
        {input.exclude} \
        {input.mask} \
        {params.window} \
        {output.bin} \
        > {log} 2>&1
    """
shell:
    """
    seqtk seq -M {input.mask} -n N \
        {input.fasta} -l 60 \
        > {output.fasta}
    samtools faidx {output.fasta}
    """
script:
    "../scripts/windows.py"
shell:
    """
    zcat -f -- {input.mask} {input.exclude} \
        | bedtools sort -i - \
        | bedtools merge -i - \
        | bedtools subtract -N -f 0.75 -a {input.windows} -b - \
        | grep -v chrX \
        | grep -v chrY \
        > {output.bed}
    """
shell:
    """
    (zcat -f -- {input.mask} {input.exclude} \
        | bedtools sort -i - \
        | bedtools merge -i - \
        | bedtools subtract -N -f 0.75 -a {input.windows} -b - \
        | grep -w chrX  || true ) \
        > {output.bed}
    """
script:
    "../scripts/make_bed9.py"
shell:
    """
    zcat {input.bed} > {output.bed}
    bedToBigBed -tab -type=bed9+1 \
        -as={params.as_file} \
        {output.bed} {input.fai} {output.bigbed}
    """
script:
    "../scripts/make_trackdb.py"
shell:
    """
    bedtools coverage -a {input.bed} -b {input.sat_bed} | cut -f 1-4,10,14 > {output.temp_sat}
    {params.sdir}/scripts/wssd_binary.py -b {output.temp_sat} -o {output.sat_bed}
    bedtools subtract -a {output.sat_bed} -b {input.gap_bed} | bedtools subtract -a - -b {input.cen_bed} > {output.wssd_bin}
    """
import gzip

file = gzip.open(snakemake.input.sam, "rt")

count = 0

line_cols = []


with gzip.open(snakemake.output.comp, "wt") as outfile:
    # NB: the first CSV column is a repeat count, despite the "qname" label.
    outfile.write("qname,flag,rname,pos,mapq\n")
    while True:
        line = file.readline().rstrip()
        if not line:
            break
        if line[0] == "@":  # skip SAM header lines
            continue
        fields = line.split("\t")[1:]  # drop qname; compare mapping columns only
        if fields == line_cols:
            count += 1
        elif line_cols == []:
            line_cols = fields
            count = 1
        else:
            out_string = ",".join([str(count)] + line_cols)
            outfile.write(f"{out_string}\n")
            line_cols = fields
            count = 1
    if line_cols:  # flush the final run, which the loop above never writes
        outfile.write(",".join([str(count)] + line_cols) + "\n")

file.close()
import pandas as pd
import math

color_hash = {
    0: "229,229,229",
    1: "196,196,196",
    2: "0,0,0",
    3: "0,0,205",
    4: "65,105,225",
    5: "100,149,237",
    6: "180,238,180",
    7: "255,255,0",
    8: "255,165,0",
    9: "139,26,26",
    10: "255,0,0",
    20: "0,255,255",
    30: "32,178,170",
    40: "0,255,0",
    50: "34,139,34",
    60: "0,100,0",
    70: "75,0,130",
    80: "139,0,139",
    90: "148,0,211",
    100: "199,21,133",
    110: "255,105,180",
    120: "255,192,203",
}

df = pd.read_csv(
    snakemake.input.cn_bed, sep="\t", header=None, names=["chr", "pos", "end", "cn"]
)
df["cn_round"] = df["cn"].round(0).astype("int32")
# df.apply(lambda row: int(row["cn"]), axis=1)
df["mapq"] = "0"
df["strand"] = "+"
df["big_start"] = "0"
df["big_end"] = "0"
df["color"] = df.apply(
    lambda row: color_hash[max(0, int(round(row["cn"])))]
    if row["cn"] < 10
    else color_hash[min(120, int(math.floor(round(row["cn"]) / 10.0) * 10))],
    axis=1,
)
df.sort_values(["chr", "pos"], inplace=True)
df[
    [
        "chr",
        "pos",
        "end",
        "cn_round",
        "mapq",
        "strand",
        "big_start",
        "big_end",
        "color",
        "cn",
    ]
].to_csv(snakemake.output.bed9, sep="\t", index=False, header=False, compression="gzip")
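The color assignment above gives each copy number below 10 its own color and floors larger values to the nearest multiple of 10, capped at 120. A minimal sketch of that bucketing (using only a subset of the full color table):

```python
import math

# subset of the make_bed9.py color table, for illustration
color_hash = {0: "229,229,229", 7: "255,255,0", 30: "32,178,170", 120: "255,192,203"}

def cn_color(cn, colors):
    # same expression as the df.apply lambda above
    if cn < 10:
        return colors[max(0, int(round(cn)))]
    return colors[min(120, int(math.floor(round(cn) / 10.0) * 10))]

print(cn_color(7.2, color_hash))    # exact bucket 7
print(cn_color(37.0, color_hash))   # floored to bucket 30
print(cn_color(500.0, color_hash))  # capped at bucket 120
```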
track_db_header = """
track {type}_cn
compositeTrack off
shortLabel {type_title} CN
longLabel  {type_title} copy number estimates
visibility hide
priority 30
type bigBed 9 +
itemRgb on
maxItems 100000
html bigbed/description.html
"""

hub = """
hub {type_title}_CN
shortLabel {type_title} CN
longLabel {type_title} copy number estimates
genomesFile genomes.txt
email mvollger.edu
"""
genomes = """
genome {sample}
trackDb trackDb.{sample}.txt
"""

track = """
    track {sm}_{type}
    parent {type}_cn
    bigDataUrl bigbed/{sm}_{type}.bb
    shortLabel {sm} {type} CN
    longLabel {sm} Copy Number
    type bigBed 9 +
    itemRgb on
    visibility dense
"""

html = """
<html>
<h3>Description</h3>
This track represents copy number estimates. Copy number is estimated over 500 bp windows of uniquely mappable sequence. Sequences are colored from cold to hot (0 - 120+) and exact copy can be found by clicking on the region of interest. \n\n 

<h3>Code Availability</h3>
<a href="https://github.com/mrvollger/fastCN-smk">GitHub</a>

<h3>Copy Number Key</h3>
{cn_key}

<h3>Credits</h3>
Please feel free to contact <a href="mailto:[email protected]">William Harvey</a> or <a href="mailto:[email protected]">Mitchell Vollger</a> with any questions and/or concerns regarding this track.

<h3>References</h3>
Bailey JA, Gu Z, Clark RA, Reinert K, Samonte RV, Schwartz S, Adams MD, Myers EW, Li PW, Eichler EE. Recent segmental duplications in the human genome. Science 2002
<br><br>

Pendleton AL, Shen F, Taravella AM, Emery S, Veeramah KR, Boyko AR, Kidd JM. Comparison of village dog and wolf genomes highlights the role of the neural crest in dog domestication. BMC Biol. 2018 
<br><br>

Sudmant PH, Mallick S, Nelson BJ, Hormozdiari F, Krumm N, Huddleston J, et al. Global diversity, population stratification, and selection of human copy-number variation. Science. 2015
<br><br>

Sudmant PH, Kitzman JO, Antonacci F, Alkan C, Malig M, Tsalenko A, et al. Diversity of human copy number. Science. 2010

<h3>Sample table</h3>
{sample_table}

<br><br>
</html>
"""

color_hash = {
    0: "229,229,229",
    1: "196,196,196",
    2: "0,0,0",
    3: "0,0,205",
    4: "65,105,225",
    5: "100,149,237",
    6: "180,238,180",
    7: "255,255,0",
    8: "255,165,0",
    9: "139,26,26",
    10: "255,0,0",
    20: "0,255,255",
    30: "32,178,170",
    40: "0,255,0",
    50: "34,139,34",
    60: "0,100,0",
    70: "75,0,130",
    80: "139,0,139",
    90: "148,0,211",
    100: "199,21,133",
    110: "255,105,180",
    120: "255,192,203",
}


def html_table(color_dict, h1, h2, rgb=True):
    rtn = '<table border="1">'
    rtn += f"<tr><th>{h1}</th><th>{h2}</th></tr>"
    for key, value in color_dict.items():
        if rgb:
            second = f'<div style="font-size:15px; color:rgb({value})">&#9632;</div>'
            # bgcolor= td
        else:
            second = value
        rtn += "<tr>"
        rtn += f'<td style="text-align: center; vertical-align: middle;">{key}</td>'
        rtn += f'<td style="background: rgb({value}); text-align: center; vertical-align: middle;">{second}</td>'
        rtn += "</tr>"
    rtn += "</table>"
    return rtn


with open(snakemake.output.track, "w") as out:
    out.write(
        track_db_header.format(
            type=snakemake.wildcards.type, type_title=snakemake.wildcards.type.upper()
        )
    )
    for sm in snakemake.params.samples:
        out.write(track.format(sm=sm, type=snakemake.wildcards.type))

with open(snakemake.output.hub, "w") as out:
    out.write(hub.format(type_title=snakemake.wildcards.type.upper()))

with open(snakemake.output.genomes, "w") as out:
    out.write(genomes.format(sample=snakemake.wildcards.sample))

sample_table = dict(zip(snakemake.params.samples, snakemake.params.reads))

with open(snakemake.output.html, "w") as out:
    out.write(
        html.format(
            cn_key=html_table(color_hash, h1="Copy number", h2="Color"),
            sample_table=html_table(sample_table, h1="Sample", h2="Read file", rgb=False),
        )
    )
import gzip

file = gzip.open(snakemake.input.comp, "rt")

# Consume header
line = file.readline()

with open(snakemake.output.exp, "w") as out_file:
    out_file.write("@\n")  # minimal placeholder SAM header
    while True:
        line = file.readline().rstrip()
        if not line:
            break
        # each compressed record is "count,flag,rname,pos,mapq"; the
        # tab-joined line (count standing in for qname) is repeated count times
        fields = line.split(",")
        out_file.write("\n".join(["\t".join(fields)] * int(fields[0])))
        out_file.write("\n")

file.close()
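The two scripts above form a run-length codec: mapping records that are identical in every column after qname are collapsed to a single "count,cols..." CSV line, then re-expanded count times. A self-contained sketch of that round trip:

```python
def compress(sam_lines):
    # collapse runs of records with identical post-qname columns
    out, cols, count = [], None, 0
    for line in sam_lines:
        fields = line.split("\t")[1:]
        if fields == cols:
            count += 1
        else:
            if cols is not None:
                out.append(",".join([str(count)] + cols))
            cols, count = fields, 1
    if cols is not None:  # flush the final run
        out.append(",".join([str(count)] + cols))
    return out

def expand(comp_lines):
    # re-emit each record count times; as in the scripts above,
    # the count takes the place of qname in the expanded lines
    out = []
    for line in comp_lines:
        fields = line.split(",")
        out.extend(["\t".join(fields)] * int(fields[0]))
    return out

sam = ["r1\t0\tchr1\t100\t40", "r2\t0\tchr1\t100\t40", "r3\t16\tchr2\t5\t60"]
print(compress(sam))  # ['2,0,chr1,100,40', '1,16,chr2,5,60']
```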
import argparse
import os
import sys
import pysam
from numba import njit
from functools import partial

# import multiprocessing


@njit
def make_dynamic_windows(name, seq, window):
    window_count = 0
    window_start = 0
    window_end = 0
    out = []
    for char in seq:
        if char.upper() != "N":
            window_count += 1
        window_end += 1

        if window_count == window:
            out.append(
                name
                + "\t"
                + str(window_start)
                + "\t"
                + str(window_end)
                + "\t"
                + str(window_count)
                + "\n"
            )
            window_start = window_end
            window_count = 0
            # print("here")
    out.append(
        name
        + "\t"
        + str(window_start)
        + "\t"
        + str(window_end)
        + "\t"
        + str(window_count)
        + "\n"
    )
    return out


# global var for inputs
args = None

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="", formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--infile", help="positional input", default=snakemake.input.fasta
    )
    parser.add_argument(
        "--window",
        help="length of window in non masked bases",
        type=int,
        default=snakemake.params.window,
    )
    parser.add_argument(
        "--outfile", help="positional output bed", default=snakemake.output.bed
    )
    args = parser.parse_args()
    out = open(args.outfile, "a")
    for rec in pysam.FastxFile(args.infile):
        rtn = make_dynamic_windows(rec.name, rec.sequence, args.window)
        for line in rtn:
            out.write(line)
    out.close()
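The script above builds windows of fixed *unmasked* length: a window closes only after accumulating `window` non-N bases, so masked (N) runs stretch windows rather than counting toward them. A plain-Python sketch of the same logic (the `@njit` decorator dropped for clarity), on a toy sequence:

```python
def dynamic_windows(name, seq, window):
    out, start, end, count = [], 0, 0, 0
    for char in seq:
        if char.upper() != "N":
            count += 1
        end += 1
        if count == window:
            out.append((name, start, end, count))
            start, count = end, 0
    # like the script, always emit the trailing partial (possibly empty) window
    out.append((name, start, end, count))
    return out

# the NN run is absorbed into the second window, stretching it to 6 bp
print(dynamic_windows("chr1", "ACGTNNACGT", 4))
```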



URL: https://github.com/mrvollger/fastCN-smk
Name: fastcn-smk
Version: v0.2
Copyright: Public Domain
License: MIT License
