Metapangenomics Workflow Using Amino Acid K-mers


This repository implements an example workflow for metapangenomics using amino acid k-mers and compares this pipeline against existing metapangenome approaches. The results of this workflow are written up at github.com/dib-lab/2021-paper-metapangenomes.

Getting started

The workflow is written in Snakemake, with software managed by conda. It can be re-run with the commands below (the snakemake invocation assumes a slurm cluster, and the trailing -n flag makes it a dry run; remove -n to execute).

conda env create --name orph --file environment.yml
conda activate orph
snakemake -j 16 --use-conda --rerun-incomplete --latency-wait 15 --resources mem_mb=200000 --cluster "sbatch -t 120 -J metap -p bmm -n 1 -N 1 -c {threads} --mem={resources.mem_mb}" -k -n
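
On a single machine without a slurm scheduler, a minimal invocation might look like the following (a sketch; adjust -j to the cores available):

snakemake -j 8 --use-conda --rerun-incomplete -k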

Code Snippets

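# Summarize per-sample sourmash gather results against GTDB, plot the
# per-species fraction of each metagenome across the H4017 time series, and
# write out the best genome per common species for use as spacegraphcats
# queries.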
library(dplyr)
library(readr)
library(purrr)
library(ggplot2)
library(tidyr)
library(aplot)

# read in metadata and results --------------------------------------------

destfile <- "inputs/gtdb-rs202.taxonomy.v2.csv"
url <- "https://osf.io/p6z3w/download"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto") 
}
gtdb_lineages <- read_csv(destfile)

destfile <- "inputs/hmp2_metadata.csv"
url <- "https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto") 
}
hmp_metadata <- read_csv(destfile)
h4017 <- hmp_metadata %>%
  select(participant_id = "Participant ID", data_id = "External ID", data_type, 
         week_num, diagnosis, antibiotics = "Antibiotics") %>%
  filter(data_type == "metagenomics") %>%
  filter(data_id %in% c('HSM67VF9', 'HSM67VFD', 'HSM67VFJ', 'HSM6XRQB',
                        'HSM6XRQI', 'HSM6XRQK', 'HSM6XRQM', 'HSM6XRQO',
                        'HSM7CYY7', 'HSM7CYY9', 'HSM7CYYB', 'HSM7CYYD'))

#gather_results <- unlist(snakemake@input[['gather']]) %>%
gather_results <- Sys.glob("outputs/sample_gather/*genomic.csv") %>%
  set_names() %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd"), .id = "sample") %>%
  mutate(sample = gsub("outputs/sample_gather/", "", sample)) %>%
  mutate(sample = gsub("_gather_gtdb-rs202-genomic.csv", "", sample)) %>%
  separate(col = name, into = c("accession"), remove = F, sep = " ", extra = "drop")

gather_results <- left_join(gather_results, gtdb_lineages, by = c("accession" = "ident")) %>%
  left_join(h4017, by = c("sample" = "data_id"))

# plot --------------------------------------------------------------------
h4017$antibiotic <- c("ciprofloxacin", "ciprofloxacin", "ciprofloxacin",
                      "none", "none", "metronidazole", "metronidazole",
                      "metronidazole", "none", "none", "none", "unknown")
h4017$antibiotic <- factor(h4017$antibiotic, levels =  c("ciprofloxacin", "metronidazole", "unknown", "none"))
abx_plt <- ggplot(h4017, aes(x = week_num, y = diagnosis, shape = antibiotic)) +
  geom_point(size = 2.5) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 8),
        axis.text = element_blank(),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 7),
        panel.grid = element_blank())+
  labs(x = "week number", y = "antibiotic") 
abx_plt

# decide which genome to query with ---------------------------------------

common_species <- gather_results %>%
  select(sample, species) %>%
  distinct() %>%
  group_by(species) %>%
  tally() %>%
  filter(n == 12)

# common_species_plt <- ggplot(gather_results %>%
#                                filter(species %in% common_species$species),
#                              aes(x = week_num, y = f_unique_to_query)) +
#   geom_col() +
#   facet_wrap(~species) +
#   theme_minimal() 

sgc_species <- gather_results %>%
  filter(species %in% common_species$species) %>%
  group_by(species) %>%
  summarize(sum_f_unique_to_query = sum(f_unique_to_query)) %>%
  filter(sum_f_unique_to_query > 0.2)

gather_results2 <- gather_results %>%
  mutate(species2 = ifelse(species %in% sgc_species$species, species, "other")) %>%
  mutate(species2 = gsub("s__", "", species2))
gather_results2$species2 <- factor(gather_results2$species2, 
                                   levels =c('other', 
                                             'Bacteroides fragilis',
                                             'Bacteroides uniformis',
                                             'Enterocloster bolteae',
                                             'Parabacteroides distasonis',
                                             'Parabacteroides merdae',
                                             'Phocaeicola vulgatus'))
gather_results2 <- left_join(gather_results2, h4017)
sgc_plt <- ggplot(gather_results2,
                     aes(x = week_num, y = f_unique_to_query, fill = species2)) +
  geom_col() +
  geom_point(aes(x = week_num, y = 1, shape = antibiotic)) +
  labs(x = "week number", y = "fraction of metagenome", fill = "species")+
  theme_minimal() +
  theme(legend.title = element_text(size = 8),
        legend.text = element_text(size = 7, face = "italic"),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 7)) +
  ylim(0, 1) + 
  scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9", "#F0E442", "#009E73",
                               "#CC79A7", "#0072B2")) +
  guides(fill = guide_legend(override.aes = list(shape = NA)))
sgc_plt

pdf("figures/common_species_breakdown2.pdf", width = 6, height = 3.6)
#pdf(snakemake@output[['pdf']], width = 6, height = 3)
#abx_plt %>% insert_bottom(sgc_plt, height = 5)
sgc_plt
dev.off()

# output sgc queries as gather file ---------------------------------------

# get rep accession
sgc_genomes <- gather_results %>%
  filter(species %in% sgc_species$species) %>%
  group_by(name, species) %>%
  summarize(sum_f_unique_to_query = sum(f_unique_to_query)) %>%
  ungroup() %>%
  group_by(species) %>%
  arrange(desc(sum_f_unique_to_query)) %>%
  slice(1)

gather_sgc_queries <- unlist(snakemake@input[['gather']])[6] %>%
#gather_sgc_queries <- Sys.glob("outputs/sample_gather/*genomic.csv")[6] %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd")) %>%
  filter(name %in% sgc_genomes$name)

write_csv(gather_sgc_queries, snakemake@output[['gather_grist']])
import sys
import argparse
import urllib.request
import csv

from lxml import etree
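
# Resolve a GenBank/RefSeq accession (e.g. GCA_000162535.1) to its genome and
# assembly-report URLs on the NCBI FTP site, then look up its NCBI taxonomy
# name and write the result as a single CSV row.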


def url_for_accession(accession):
    db, acc = accession.strip().split("_")
    if '.' in acc:
        number, version = acc.split(".")
    else:
        number, version = acc, '1'
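    # NCBI lays genomes out under e.g. GCA/000/162/535/: split the accession's
    # numeric part into 3-character chunks to build that directory path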
    number = "/".join([number[p : p + 3] for p in range(0, len(number), 3)])
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{number}"
    print(f"opening directory: {url}", file=sys.stderr)

    with urllib.request.urlopen(url) as response:
        all_names = response.read()

    print("done!", file=sys.stderr)

    all_names = all_names.decode("utf-8")

    full_name = None
    for line in all_names.splitlines():
        if line.startswith('<a href='):
            name = line.split('"')[1][:-1]
            db_, acc_, *_ = name.split("_")
            if db_ == db and acc_.startswith(acc):
                full_name = name
                break

    if full_name is None:
        return None
    else:
        url = "htt" + url[3:]
        return (
            f"{url}/{full_name}/{full_name}_genomic.fna.gz",
            f"{url}/{full_name}/{full_name}_assembly_report.txt",
        )


def get_taxid_from_assembly_report(url):
    print(f"opening assembly report: {url}", file=sys.stderr)
    with urllib.request.urlopen(url) as response:
        content = response.read()
    print("done!", file=sys.stderr)

    content = content.decode("utf-8").splitlines()
    for line in content:
        if "Taxid:" in line:
            line = line.strip()
            pos = line.find("Taxid:")
            assert pos >= 0
            pos += len("Taxid:")
            taxid = line[pos:]
            taxid = taxid.strip()
            return taxid

    assert 0, "no 'Taxid:' line found in assembly report"


def get_tax_name_for_taxid(taxid):
    tax_url = (
        f"https://www.ncbi.nlm.nih.gov/taxonomy/?term={taxid}&report=taxon&format=text"
    )
    print(f"opening tax url: {tax_url}", file=sys.stderr)
    with urllib.request.urlopen(tax_url) as response:
        content = response.read()

    print("done!", file=sys.stderr)

    root = etree.fromstring(content)
    notags = etree.tostring(root).decode("utf-8")
    if notags.startswith("<pre>"):
        notags = notags[5:]
    if notags.endswith("</pre>"):
        notags = notags[:-6]
    notags = notags.strip()

    return notags


def main():
    p = argparse.ArgumentParser()
    p.add_argument("accession")
    p.add_argument("-o", "--output")
    args = p.parse_args()

    fieldnames = ["acc", "genome_url", "assembly_report_url", "ncbi_tax_name"]
    fp = None
    if args.output:
        fp = open(args.output, "wt")
        w = csv.DictWriter(fp, fieldnames=fieldnames)
    else:
        w = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    w.writeheader()

    acc = args.accession

    genome_url, assembly_report_url = url_for_accession(acc)
    taxid = get_taxid_from_assembly_report(assembly_report_url)
    tax_name = get_tax_name_for_taxid(taxid)

    d = dict(
        acc=acc,
        genome_url=genome_url,
        assembly_report_url=assembly_report_url,
        ncbi_tax_name=tax_name,
    )

    w.writerow(d)
    print(f"retrieved for {acc} - {tax_name}", file=sys.stderr)

    if fp:
        fp.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())
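# Assign a GTDB lineage to each metabat2 bin using its best (rank 0) gather
# match, writing the lineage table consumed by charcoal for decontamination.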
library(dplyr)
library(readr)
library(purrr)
library(tidyr)

gtdb_lineages <- read_csv(snakemake@input[['gtdb_lineages']])

charcoal_lineages <- Sys.glob(paste0("outputs/metabat2_gather/", snakemake@wildcards[['sample']], "_bin.*_gather_gtdb-rs202-genomic.csv")) %>%
#charcoal_lineages <- unlist(snakemake@input[['gather']]) %>% 
  map_dfr(read_csv, col_types = "dddddlllcccddddcccd") %>%
  filter(gather_result_rank == 0) %>%
  select(query_name, name) %>%
  mutate(query_name = gsub(".*\\.", "", query_name),
         query_name = gsub("bin", "bin.", query_name),
         query_name = paste0(query_name, ".fa"),
         accession = gsub(" .*", "", name)) %>%
  left_join(gtdb_lineages, by = c("accession" = "ident")) %>%
  select(-name, -accession)

write_csv(charcoal_lineages, snakemake@output[['charcoal_lineages']])
install.packages("pagoo", repos="http://cran.us.r-project.org")
library(pagoo)
file.create(snakemake@output[['pagoo']])
library(readr)
library(dplyr)

#path <- "outputs/nbhd_sketch_tables_species/GCA_000162535.1-s__Parabacteroides_distasonis_long.csv"
#sketch_table <- read_csv(path)

sketch_table <- read_csv(snakemake@input[['csv']])
# filter samples that don't have enough k-mers for the species
# (using 10000 as threshold)
sketch_table_grp <- sketch_table %>%
  group_by(acc) %>%
  tally()

keep <- sketch_table_grp %>%
  filter(n > 10000)

pg_sigs <- sketch_table %>%
  filter(acc %in% keep$acc) %>%
  select(acc) %>%
  distinct() %>%
  mutate(acc = gsub("\\.G", "-G", acc), 
         acc = gsub("\\.s", "-s", acc),
         acc = paste0("outputs/nbhd_sigs_species/", acc, ".sig"))

write_tsv(pg_sigs, snakemake@output[['lst']], col_names = FALSE)
library(readr)
library(dplyr)

files <- read_tsv(snakemake@input[['txt']], col_names = "bins")
files <- files %>%
  mutate(bins = gsub("_sigs", "", bins),
         bins = gsub("\\.sig", "\\.ffn", bins))

write_tsv(files, snakemake@output[['txt']], col_names = FALSE)
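# Match each metabat2 bin to its best-species gather result and write out the
# list of bin signature paths belonging to the species encoded in the acc_db
# wildcard.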
library(readr)
library(dplyr)
library(tidyr)
library(purrr)

gtdb_lineages <- read_csv(snakemake@input[['lineages']])

# specified as a Sys.glob so as not to mess up the sample wildcard in the workflow
gather_bins <- Sys.glob("outputs/metabat2_gather/*csv") %>%
#gather_bins <- unlist(snakemake@input[['gather']]) %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd")) %>%
  separate(col = name, into = c("accession"), remove = F, sep = " ", extra = "drop") %>%
  mutate(sample = basename(query_filename)) %>%
  mutate(sample = gsub("-.*", "", sample)) %>%
  left_join(gtdb_lineages, by = c("accession" = "ident"))

species_of_interest <- read_csv(unlist(snakemake@input[['acc_to_db']])) %>%
  #filter(!species %in% c("s__Ruminococcus_B_gnavus", "s__Clostridium_Q_symbiosum", "s__Roseburia_intestinalis")) %>%
  mutate(species = gsub("(s__.*?)_", "\\1 ", species))

acc_to_db <- read_csv(unlist(snakemake@input[['acc_to_db']])) %>%
  #filter(!species %in% c("s__Ruminococcus_B_gnavus", "s__Clostridium_Q_symbiosum", "s__Roseburia_intestinalis")) %>%
  mutate(acc_to_db = paste0(accession, "-", species)) %>%
  mutate(species = gsub("(s__.*?)_", "\\1 ", species))

# only consider best match for the bin
gather_bins <- gather_bins %>%
  filter(gather_result_rank == 0)

named_bins <- gather_bins %>% 
  left_join(acc_to_db, by = "species") %>%
  select(acc_to_db, query_filename) %>%
  mutate(sig_filename = gsub("metabat2", "metabat2_prokka_sigs", query_filename),
         sig_filename = gsub("/bin", "_bin", sig_filename),
         sig_filename = gsub(".fa", ".sig", sig_filename)) %>%
  select(acc_to_db, sig_filename)

# filter to single acc_db using wildcard
# and only write out sig path
tmp <- named_bins %>%
  filter(acc_to_db %in% snakemake@wildcards[["acc_db"]]) %>%
  select(sig_filename)

write_tsv(tmp, snakemake@output[['txt']], col_names = FALSE)
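# Build a pagoo pangenome object from protein k-mer presence/absence across
# samples, then plot a PCA colored by antibiotic use and a k-mer
# presence/absence binmap over the time series.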
library(pagoo)
library(readr)
library(ggplot2)
library(dplyr)

read_long_sketch_table_as_pagoo <- function(path, threshold = 2000){
  sketch_table <- read_csv(path)

  # filter samples that don't have enough k-mers for the species
  sketch_table_grp <- sketch_table %>%
    group_by(acc) %>%
    tally()

  print(sketch_table_grp)

  keep <- sketch_table_grp %>%
    filter(n > threshold)

  sketch_table <- sketch_table %>%
    filter(acc %in% keep$acc) %>%
    select(gene    = minhash, 
           org     = acc,
           cluster = minhash) 
  p <- pagoo(data = as.data.frame(sketch_table))
  return(p)
}

#list.files("outputs/nbhd_sketch_tables_species/")
#species_string1 <- ""
species_string1 <- snakemake@wildcards[['acc_db']]
species_string2 <- gsub("-", ".", species_string1)
species_string3 <- gsub(".*s__", "", species_string2)
species_string3 <- gsub("_", " ", species_string3)

pg <- read_long_sketch_table_as_pagoo(snakemake@input[['csv']], threshold = 0)
#pg <- read_long_sketch_table_as_pagoo(paste0("outputs/nbhd_sketch_tables_species/", 
#                                             species_string1, "_long.csv"), threshold = 0)
pg_organisms <- gsub(paste0(".", species_string2), "", as.character(pg$organisms$org))

destfile <- "inputs/hmp2_metadata.csv"
url <- "https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto")
}
hmp_metadata <- read_csv(destfile)

# serology data_type contains information on abx types
# tmp <- hmp_metadata %>%
#   select(participant_id = "Participant ID", data_id = "External ID", data_type,
#          week_num, diagnosis, antibiotics = "Antibiotics",
#          fecalcal, metronidazole = "Flagyl (Metronidazole)",
#          cipro = "Cipro (Ciprofloxin)", rifaxamin = "Xifaxin (rifaxamin)",
#          levaquin = Levaquin, other_abx = "Other Antibiotic:") %>%
#   filter(participant_id =="H4017")

h4017 <- hmp_metadata %>%
  select(data_id = "External ID", data_type,
         week_num, diagnosis, antibiotics = "Antibiotics") %>%
  filter(data_type == "metagenomics") %>%
  filter(data_id %in% c('HSM67VF9', 'HSM67VFD', 'HSM67VFJ', 'HSM6XRQB',
                        'HSM6XRQI', 'HSM6XRQK', 'HSM6XRQM', 'HSM6XRQO',
                        'HSM7CYY7', 'HSM7CYY9', 'HSM7CYYB', 'HSM7CYYD'))

h4017 <- h4017 %>%
  mutate(org = paste0(data_id, ".", species_string2)) %>%
  filter(data_id %in% pg_organisms)

pg$add_metadata(map = "org", as.data.frame(h4017))

pdf(snakemake@output[['pca']], height = 3, width = 4)
pg$gg_pca() +
  geom_point(aes(color = antibiotics)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "italic")) +
  labs(title = species_string3) +
  scale_color_manual(values = c("#1F78B4", "#33A02C"))
dev.off()

# try color binmap --------------------------------------------------------

tpm <- t(pg$pan_matrix)
tpm[which(tpm > 0, arr.ind = TRUE)] <- 1L
bm <- as.data.frame(tpm)
or <- order(rowSums(bm), decreasing = TRUE)
lvls <- rownames(bm)[or]
bm$Cluster <- factor(rownames(bm), levels = lvls)
bm <- reshape2::melt(bm, 'Cluster')
bm$value <- factor(bm$value, levels = c(1, 0))
bm <- left_join(bm, h4017, by = c("variable" = "org"))
bm$value_abx <- paste0(bm$value, "_", bm$antibiotics)
bm$value_abx <- factor(bm$value_abx, levels = c("0_No", "1_No", "0_Yes",  "1_Yes"))
colnames(bm)[which(colnames(bm) == 'variable')] <- "Organism"

#pdf(paste0("outputs/pagoo_species/", species_string1, "_binmap.pdf"), height = 3, width = 4)
pdf(snakemake@output[['binmap']], height = 3, width = 4)
ggplot(bm, aes(Cluster, as.factor(week_num), fill=value_abx)) +
  geom_raster() +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        plot.title = element_text(face = "italic")) +
  #scale_fill_grey(start = .2, end = .9)
  scale_fill_brewer(palette = "Paired", labels = c("0, No", "1, No", "0, Yes", "1, Yes")) +
  labs(x = "protein k-mer", y = "week number", fill = "presence\n& antibiotics",
       title = species_string3)
dev.off()
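# Map each gather match accession to its GTDB species name so that every
# query genome can be paired with a species-level database downstream.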
library(dplyr)
library(readr)

#lineages <- read_csv("inputs/gtdb-rs202.taxonomy.v2.csv")
lineages <- read_csv(snakemake@input[['lineages']])

gather <- read_csv(snakemake@input[['gather']]) %>%
#gather <- read_csv("outputs/genbank/gather_gtdb-rs202-genomic.x.genbank.gather.csv") %>%
  mutate(accession = gsub(" .*", "", name)) %>%
  left_join(lineages, by = c("accession" = "ident")) %>%
  select(accession, species) %>%
  #mutate(species = gsub(" ", "_", species),
  #       database = paste0("gtdb-rs202.", species, ".protein-k10.nodegraph"))
  mutate(species = gsub(" ", "_", species))

write_csv(gather, snakemake@output[['csv']])
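# Strip the leading @<n>:<n>:<n>: prefix from unmapped read names so the list
# can be used to extract reads by name (e.g. with seqtk subseq).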
library(dplyr)
library(readr)

read_names <- read_tsv(snakemake@input[['names']], col_names = c("names")) %>%
  mutate(names = gsub("^@[0-9]*:[0-5]:[0-5]:", "", names))

write_tsv(read_names, snakemake@output[["lst"]], col_names = FALSE)
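# Dump the hash values of a sourmash signature to a one-column CSV whose
# column name is the signature's file name.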
from sourmash import signature
import pandas as pd
import os
import sys
import argparse

def main():
    p = argparse.ArgumentParser()
    p.add_argument('signature')       # sourmash signature
    p.add_argument('output')          # output csv file name
    args = p.parse_args()

    # load the signature from disk
    sigfp = open(args.signature, 'rt')
    siglist = list(signature.load_signatures(sigfp))
    loaded_sig = siglist[0]

    # Get the minhashes
    mins = loaded_sig.minhash.hashes.keys()

    name = os.path.basename(args.signature)
    df = pd.DataFrame(mins, columns=[name])

    # write to a csv
    df.to_csv(args.output, index = False)

if __name__ == '__main__':
    sys.exit(main())
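## Combine per-signature hash CSVs into one long table with columns
## minhash, acc (signature name), and present (always 1).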
files <- snakemake@input
files <- unlist(files, use.names=FALSE)

## read files into list with each row labelled by accession. 
tab_long <- list()
for (i in seq_along(files)) {
  print(i)
  sig <- read.csv(files[i])           # read in signature csv as df
  acc <- colnames(sig)[1]             # signature/sample name from the header
  acc <- gsub("\\.sig$", "", acc)     # strip the .sig file suffix
  sig$acc <- acc                      # record the name in its own column
  colnames(sig) <- c("minhash", "acc")
  tab_long[[i]] <- sig
}

## bind into one dataframe
tab_long <- do.call(rbind, tab_long)
tab_long$present <- 1 # add a col with "1" for presence/absence
write.csv(tab_long, snakemake@output[['csv']],
          quote = F, row.names = F)
shell:'''
fastp --in1 {input.R1} \
  --in2 {input.R2} \
  --out1 {output.R1} \
  --out2 {output.R2} \
  --detect_adapter_for_pe \
  --qualified_quality_phred 4 \
  --length_required 31 --correction \
  --json {output.json} \
  --html {output.html}
'''
shell:'''
bbduk.sh -Xmx64g t=3 in={input.r1} in2={input.r2} out={output.r1} out2={output.r2} outm={output.human_r1} outm2={output.human_r2} k=31 ref={input.human}
'''
shell:'''
interleave-reads.py {input} | trim-low-abund.py --gzip -C 3 -Z 18 -M 60e9 -V - -o {output}
'''
SnakeMake From line 178 of main/Snakefile
shell:'''
repair.sh in1={input} out1={output.r1} out2={output.r2} outs={output.singleton} repair
'''
SnakeMake From line 198 of main/Snakefile
shell:'''
megahit -1 {input.r1} -2 {input.r2} --out-dir {output.tmp_dir} --out-prefix {wildcards.sample}
mv {output.tmp_dir}/{wildcards.sample}.contigs.fa {output.fa}
'''
shell:'''
bowtie2-build {input} {input}    
'''
shell:'''
bowtie2 -x {input.fa} -1 {input.r1} -2 {input.r2} |
    samtools view -bS -o {output}
'''
shell:'''
samtools sort {input} -o {output}
'''
shell:'''
samtools index {input}
'''
shell:'''
jgi_summarize_bam_contig_depths --outputDepth {output} {input.bam}
'''
SnakeMake From line 282 of main/Snakefile
shell:'''
metabat2 -m 1500 -i {input.fa} --abdFile {input.depth} -o {params.outdir}
'''
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.sample}.bin{wildcards.binn} {input}
"""
SnakeMake From line 309 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 324 of main/Snakefile
script: "scripts/generate_charcoal_lineages.R"
SnakeMake From line 347 of main/Snakefile
shell:'''
ls {params.genome_dir}/*fa | xargs -n 1 basename > {output} 
'''
SnakeMake From line 364 of main/Snakefile
run:
    with open(output.conf, 'wt') as fp:
        print(f"""\
SnakeMake From line 383 of main/Snakefile
shell:'''
python -m charcoal run {input.conf} -j {threads} clean --nolock --latency-wait 15 --rerun-incomplete --use-conda
'''
SnakeMake From line 405 of main/Snakefile
shell:'''
gunzip -c {input} > {output}
'''
SnakeMake From line 416 of main/Snakefile
shell:'''
prokka {input} --outdir {params.outdir} --prefix {wildcards.sample}_bin.{wildcards.bin} \
    --metagenome --force --locustag {wildcards.sample}_bin.{wildcards.bin} --cpus {threads} --centre X --compliant
'''
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.sample}_bin.{wildcards.bin} {input}
"""
SnakeMake From line 443 of main/Snakefile
script: "scripts/metabat_generate_lists_of_bins_of_same_species.R"
SnakeMake From line 470 of main/Snakefile
shell:"""
sourmash sig intersect -o {output} --from-file {input} 
"""
SnakeMake From line 480 of main/Snakefile
shell:"""
sourmash sig rename -o {output} {input} {wildcards.acc_db}_metabat2_core
"""
SnakeMake From line 492 of main/Snakefile
shell:"""
sourmash sig merge --name {wildcards.acc_db}_metabat2_all -o {output} --from-file {input} 
"""
SnakeMake From line 504 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 516 of main/Snakefile
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.sample} {input}
"""
SnakeMake From line 533 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 551 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 569 of main/Snakefile
script: "scripts/gather_gtdb_to_sgc_queries.R"
SnakeMake From line 586 of main/Snakefile
shell: """
    python scripts/genbank_genomes.py {wildcards.acc} \
        --output {output.csvfile}
"""
SnakeMake From line 619 of main/Snakefile
run:
    with open(input.csvfile, 'rt') as infp:
        r = csv.DictReader(infp)
        rows = list(r)
        assert len(rows) == 1
        row = rows[0]
        acc = row['acc']
        assert wildcards.acc.startswith(acc)
        url = row['genome_url']
        name = row['ncbi_tax_name']

        print(f"downloading genome for acc {acc}/{name} from NCBI...", file=sys.stderr)
        with open(output.genome, 'wb') as outfp:
            with urllib.request.urlopen(url) as response:
                content = response.read()
                outfp.write(content)
                print(f"...wrote {len(content)} bytes to {output.genome}",
                    file=sys.stderr)
SnakeMake From line 647 of main/Snakefile
run:
    # something weird is happening here where snakemake solves the inputs correctly, but 
    # "queries" ends up being the gather csv file. So...parse the CSV file for accession
    # numbers. Sigh.
    gather_csv = input[0]
    genome_accs = []
    with open(gather_csv, 'rt') as fp:
        r = csv.DictReader(fp)
        for row in r:
            acc = row['name'].split(' ')[0]
            acc = "outputs/genbank_genomes/" + acc + "_genomic.fna.gz"
            genome_accs.append(acc)

    query_list = "\n- ".join(genome_accs)
    #query_list = "\n- ".join(input.queries)
    with open(output.conf, 'wt') as fp:
       print(f"""\
shell:'''
python -m spacegraphcats run {input.conf} extract_reads --nolock --outdir={params.outdir} --rerun-incomplete 
'''
SnakeMake From line 716 of main/Snakefile
shell:'''
ls {output}
'''
SnakeMake From line 727 of main/Snakefile
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.acc} {input}
"""
SnakeMake From line 740 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 758 of main/Snakefile
shell:'''
orpheum translate --alphabet protein --peptide-ksize 10  --peptides-are-bloom-filter --noncoding-nucleotide-fasta {output.nuc_noncoding} --coding-nucleotide-fasta {output.nuc} --csv {output.csv} --json-summary {output.json} {input.ref} {input.fasta} > {output.pep}
'''
SnakeMake From line 778 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc} {input}
"""
SnakeMake From line 791 of main/Snakefile
shell:'''
python scripts/sig_to_csv.py {input} {output}
'''
SnakeMake From line 804 of main/Snakefile
script: "scripts/sketch_csv_to_long.R"
SnakeMake From line 818 of main/Snakefile
script: "scripts/query_to_species_db.R"
SnakeMake From line 838 of main/Snakefile
shell:'''
orpheum translate --jaccard-threshold 0.39 --alphabet protein --peptide-ksize 10  --peptides-are-bloom-filter --noncoding-nucleotide-fasta {output.nuc_noncoding} --coding-nucleotide-fasta {output.nuc} --csv {output.csv} --json-summary {output.json} {input.ref} {input.fasta} > {output.pep}
'''
SnakeMake From line 856 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db} {input}
"""
SnakeMake From line 869 of main/Snakefile
shell:'''
python scripts/sig_to_csv.py {input} {output}
'''
SnakeMake From line 882 of main/Snakefile
script: "scripts/sketch_csv_to_long.R"
SnakeMake From line 896 of main/Snakefile
script: "scripts/install_pagoo.R"
SnakeMake From line 906 of main/Snakefile
script: "scripts/pagoo_plt.R"
SnakeMake From line 921 of main/Snakefile
shell:"""
sourmash sketch dna -p k=51,scaled=2000 -o {output} --name {wildcards.acc_db} {input}
"""
SnakeMake From line 933 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --scaled 2000 -k 51 {input.sig} {input.db} 
'''
SnakeMake From line 950 of main/Snakefile
script: "scripts/kaa_generate_list_of_sigs_to_intersect.R"
SnakeMake From line 963 of main/Snakefile
shell:"""
sourmash sig intersect -o {output} --from-file {input} 
"""
SnakeMake From line 973 of main/Snakefile
shell:"""
sourmash sig rename -o {output} {input} {wildcards.acc_db}_kaa_core
"""
SnakeMake From line 985 of main/Snakefile
shell:"""
sourmash sig merge --name {wildcards.acc_db}_kaa_all -o {output} {input} 
"""
SnakeMake From line 998 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 1010 of main/Snakefile
shell:'''
sourmash compare -o {output.comp} --csv {output.csv} {input}
'''
SnakeMake From line 1031 of main/Snakefile
shell:'''
sourmash compare --max-containment -o {output.comp} --csv {output.csv} {input}
'''
SnakeMake From line 1048 of main/Snakefile
shell:'''
grep ">" {input} > {output}
'''
SnakeMake From line 1068 of main/Snakefile
script: "scripts/identify_core_gene_sequence_names.R"
SnakeMake From line 1083 of main/Snakefile
shell:'''
seqtk subseq {input.fa} {input.core_names} > {output}
'''
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1107 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db}_roary_core {input}
"""
SnakeMake From line 1119 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db}_roary_all {input}
"""
SnakeMake From line 1131 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 1143 of main/Snakefile
shell:"""
sourmash compare --csv {output} {input}
"""
SnakeMake From line 1162 of main/Snakefile
shell:"""
sourmash compare --containment --csv {output} {input}
"""
SnakeMake From line 1177 of main/Snakefile
shell:"""
sourmash compare --max-containment --csv {output} {input}
"""
SnakeMake From line 1192 of main/Snakefile
shell:"""
sourmash compare --csv {output} {input}
"""
SnakeMake From line 1213 of main/Snakefile
shell:"""
sourmash compare --containment --csv {output} {input}
"""
SnakeMake From line 1228 of main/Snakefile
shell:"""
sourmash compare --max-containment --csv {output} {input}
"""
SnakeMake From line 1243 of main/Snakefile
script: "scripts/metabat_generate_lists_of_bins_of_same_species_prokka_ffn.R"
SnakeMake From line 1259 of main/Snakefile
shell:'''
xargs cat < {input} > {output}
'''
SnakeMake From line 1268 of main/Snakefile
shell:'''
cd-hit-est -c .95 -d 0 -i {input} -o {output}
'''
shell:'''
bwa index {input}
''' 
shell:'''
bwa index {input}
''' 
shell:'''
bwa mem -p -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
bwa mem -p -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1410 of main/Snakefile
shell:'''
paladin index -r3 {input}
'''
SnakeMake From line 1422 of main/Snakefile
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1434 of main/Snakefile
shell:'''
paladin index -r3 {input}
'''
SnakeMake From line 1446 of main/Snakefile
shell:'''
paladin align -C -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
grep "^@" {input} > {output}
'''
SnakeMake From line 1505 of main/Snakefile
script: "scripts/rm_unmapped_read_name_prefixes.R"
SnakeMake From line 1517 of main/Snakefile
shell:'''
seqtk subseq {input.reads} {input.lst} > {output}
'''
shell:'''
paladin align -C -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
grep "^@" {input} > {output}
'''
SnakeMake From line 1589 of main/Snakefile
script: "scripts/rm_unmapped_read_name_prefixes.R"
SnakeMake From line 1601 of main/Snakefile
shell:'''
seqtk subseq {input.reads} {input.lst} > {output}
'''
shell:"""
sourmash sketch dna -p k=51,scaled=2000 -o {output} --name {wildcards.sample}-{wildcards.acc_db} {input}
"""
SnakeMake From line 1629 of main/Snakefile
shell:'''
sourmash gather -o {output} --threshold-bp 0 --scaled 2000 -k 51 {input.sig} {input.db} 
'''
SnakeMake From line 1643 of main/Snakefile
shell:'''
megahit -r {input} --out-dir {output.tmp_dir} --out-prefix {wildcards.sample}-{wildcards.acc_db}
mv {output.tmp_dir}/{wildcards.sample}-{wildcards.acc_db}.contigs.fa {output.fa}
'''
shell:'''
prokka {input} --outdir {params.outdir} --prefix {wildcards.sample}-{wildcards.acc_db} \
    --metagenome --force --locustag {wildcards.sample} --cpus {threads} --centre X --compliant
'''
shell:'''
cat {input} > {output}
'''
SnakeMake From line 1684 of main/Snakefile

URL: https://github.com/dib-lab/2021-metapangenome-example
Name: 2021-metapangenome-example
Version: v1.0
License: BSD 3-Clause "New" or "Revised" License