Metapangenomics Workflow Using Amino Acid K-mers


This repository implements an example workflow for metapangenomics using amino acid k-mers and compares this pipeline against existing metapangenome approaches. The results of this workflow are written up at github.com/dib-lab/2021-paper-metapangenomes.

Getting started

The workflow is written in Snakemake, with software managed by conda. It can be re-run with the commands below (the snakemake invocation assumes a slurm cluster, and the trailing -n flag makes it a dry run; remove -n to execute).

conda env create --name orph --file environment.yml
conda activate orph
snakemake -j 16 --use-conda --rerun-incomplete --latency-wait 15 --resources mem_mb=200000 --cluster "sbatch -t 120 -J metap -p bmm -n 1 -N 1 -c {threads} --mem={resources.mem_mb}" -k -n
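
On a single machine without a slurm scheduler, a minimal invocation might look like the following (a sketch; adjust -j to the cores available):

snakemake -j 8 --use-conda --rerun-incomplete -k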

Code Snippets

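# Summarize per-sample sourmash gather results against GTDB, plot the
# per-species fraction of each metagenome across the H4017 time series, and
# write out the best genome per common species for use as spacegraphcats
# queries.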
library(dplyr)
library(readr)
library(purrr)
library(ggplot2)
library(tidyr)
library(aplot)

# read in metadata and results --------------------------------------------

destfile <- "inputs/gtdb-rs202.taxonomy.v2.csv"
url <- "https://osf.io/p6z3w/download"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto") 
}
gtdb_lineages <- read_csv(destfile)

destfile <- "inputs/hmp2_metadata.csv"
url <- "https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto") 
}
hmp_metadata <- read_csv(destfile)
h4017 <- hmp_metadata %>%
  select(participant_id = "Participant ID", data_id = "External ID", data_type, 
         week_num, diagnosis, antibiotics = "Antibiotics") %>%
  filter(data_type == "metagenomics") %>%
  filter(data_id %in% c('HSM67VF9', 'HSM67VFD', 'HSM67VFJ', 'HSM6XRQB',
                        'HSM6XRQI', 'HSM6XRQK', 'HSM6XRQM', 'HSM6XRQO',
                        'HSM7CYY7', 'HSM7CYY9', 'HSM7CYYB', 'HSM7CYYD'))

#gather_results <- unlist(snakemake@input[['gather']]) %>%
gather_results <- Sys.glob("outputs/sample_gather/*genomic.csv") %>%
  set_names() %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd"), .id = "sample") %>%
  mutate(sample = gsub("outputs/sample_gather/", "", sample)) %>%
  mutate(sample = gsub("_gather_gtdb-rs202-genomic.csv", "", sample)) %>%
  separate(col = name, into = c("accession"), remove = F, sep = " ", extra = "drop")

gather_results <- left_join(gather_results, gtdb_lineages, by = c("accession" = "ident")) %>%
  left_join(h4017, by = c("sample" = "data_id"))

# plot --------------------------------------------------------------------
h4017$antibiotic <- c("ciprofloxacin", "ciprofloxacin", "ciprofloxacin",
                      "none", "none", "metronidazole", "metronidazole",
                      "metronidazole", "none", "none", "none", "unknown")
h4017$antibiotic <- factor(h4017$antibiotic, levels =  c("ciprofloxacin", "metronidazole", "unknown", "none"))
abx_plt <- ggplot(h4017, aes(x = week_num, y = diagnosis, shape = antibiotic)) +
  geom_point(size = 2.5) +
  theme_minimal() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_text(size = 8),
        axis.text = element_blank(),
        legend.title = element_text(size = 8),
        legend.text = element_text(size = 7),
        panel.grid = element_blank())+
  labs(x = "week number", y = "antibiotic") 
abx_plt

# decide which genome to query with ---------------------------------------

common_species <- gather_results %>%
  select(sample, species) %>%
  distinct() %>%
  group_by(species) %>%
  tally() %>%
  filter(n == 12)

# common_species_plt <- ggplot(gather_results %>%
#                                filter(species %in% common_species$species),
#                              aes(x = week_num, y = f_unique_to_query)) +
#   geom_col() +
#   facet_wrap(~species) +
#   theme_minimal() 

sgc_species <- gather_results %>%
  filter(species %in% common_species$species) %>%
  group_by(species) %>%
  summarize(sum_f_unique_to_query = sum(f_unique_to_query)) %>%
  filter(sum_f_unique_to_query > 0.2)

gather_results2 <- gather_results %>%
  mutate(species2 = ifelse(species %in% sgc_species$species, species, "other")) %>%
  mutate(species2 = gsub("s__", "", species2))
gather_results2$species2 <- factor(gather_results2$species2, 
                                   levels =c('other', 
                                             'Bacteroides fragilis',
                                             'Bacteroides uniformis',
                                             'Enterocloster bolteae',
                                             'Parabacteroides distasonis',
                                             'Parabacteroides merdae',
                                             'Phocaeicola vulgatus'))
gather_results2 <- left_join(gather_results2, h4017)
sgc_plt <- ggplot(gather_results2,
                     aes(x = week_num, y = f_unique_to_query, fill = species2)) +
  geom_col() +
  geom_point(aes(x = week_num, y = 1, shape = antibiotic)) +
  labs(x = "week number", y = "fraction of metagenome", fill = "species")+
  theme_minimal() +
  theme(legend.title = element_text(size = 8),
        legend.text = element_text(size = 7, face = "italic"),
        axis.title = element_text(size = 8),
        axis.text = element_text(size = 7)) +
  ylim(0, 1) + 
  scale_fill_manual(values = c("#999999", "#E69F00", "#56B4E9", "#F0E442", "#009E73",
                               "#CC79A7", "#0072B2")) +
  guides(fill = guide_legend(override.aes = list(shape = NA)))
sgc_plt

pdf("figures/common_species_breakdown2.pdf", width = 6, height = 3.6)
#pdf(snakemake@output[['pdf']], width = 6, height = 3)
#abx_plt %>% insert_bottom(sgc_plt, height = 5)
sgc_plt
dev.off()

# output sgc queries as gather file ---------------------------------------

# get rep accession
sgc_genomes <- gather_results %>%
  filter(species %in% sgc_species$species) %>%
  group_by(name, species) %>%
  summarize(sum_f_unique_to_query = sum(f_unique_to_query)) %>%
  ungroup() %>%
  group_by(species) %>%
  arrange(desc(sum_f_unique_to_query)) %>%
  slice(1)

gather_sgc_queries <- unlist(snakemake@input[['gather']])[6] %>%
#gather_sgc_queries <- Sys.glob("outputs/sample_gather/*genomic.csv")[6] %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd")) %>%
  filter(name %in% sgc_genomes$name)

write_csv(gather_sgc_queries, snakemake@output[['gather_grist']])
import sys
import argparse
import urllib.request
import csv

from lxml import etree
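
# Resolve a GenBank/RefSeq accession (e.g. GCA_000162535.1) to its genome and
# assembly-report URLs on the NCBI FTP site, then look up its NCBI taxonomy
# name and write the result as a single CSV row.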


def url_for_accession(accession):
    db, acc = accession.strip().split("_")
    if '.' in acc:
        number, version = acc.split(".")
    else:
        number, version = acc, '1'
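    # NCBI lays genomes out under e.g. GCA/000/162/535/: split the accession's
    # numeric part into 3-character chunks to build that directory path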
    number = "/".join([number[p : p + 3] for p in range(0, len(number), 3)])
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{number}"
    print(f"opening directory: {url}", file=sys.stderr)

    with urllib.request.urlopen(url) as response:
        all_names = response.read()

    print("done!", file=sys.stderr)

    all_names = all_names.decode("utf-8")

    full_name = None
    for line in all_names.splitlines():
        if line.startswith('<a href='):
            name = line.split('"')[1][:-1]
            db_, acc_, *_ = name.split("_")
            if db_ == db and acc_.startswith(acc):
                full_name = name
                break

    if full_name is None:
        return None
    else:
        url = "htt" + url[3:]
        return (
            f"{url}/{full_name}/{full_name}_genomic.fna.gz",
            f"{url}/{full_name}/{full_name}_assembly_report.txt",
        )


def get_taxid_from_assembly_report(url):
    print(f"opening assembly report: {url}", file=sys.stderr)
    with urllib.request.urlopen(url) as response:
        content = response.read()
    print("done!", file=sys.stderr)

    content = content.decode("utf-8").splitlines()
    for line in content:
        if "Taxid:" in line:
            line = line.strip()
            pos = line.find("Taxid:")
            assert pos >= 0
            pos += len("Taxid:")
            taxid = line[pos:]
            taxid = taxid.strip()
            return taxid

    assert 0, "no 'Taxid:' line found in assembly report"


def get_tax_name_for_taxid(taxid):
    tax_url = (
        f"https://www.ncbi.nlm.nih.gov/taxonomy/?term={taxid}&report=taxon&format=text"
    )
    print(f"opening tax url: {tax_url}", file=sys.stderr)
    with urllib.request.urlopen(tax_url) as response:
        content = response.read()

    print("done!", file=sys.stderr)

    root = etree.fromstring(content)
    notags = etree.tostring(root).decode("utf-8")
    if notags.startswith("<pre>"):
        notags = notags[5:]
    if notags.endswith("</pre>"):
        notags = notags[:-6]
    notags = notags.strip()

    return notags


def main():
    p = argparse.ArgumentParser()
    p.add_argument("accession")
    p.add_argument("-o", "--output")
    args = p.parse_args()

    fieldnames = ["acc", "genome_url", "assembly_report_url", "ncbi_tax_name"]
    fp = None
    if args.output:
        fp = open(args.output, "wt")
        w = csv.DictWriter(fp, fieldnames=fieldnames)
    else:
        w = csv.DictWriter(sys.stdout, fieldnames=fieldnames)
    w.writeheader()

    acc = args.accession

    genome_url, assembly_report_url = url_for_accession(acc)
    taxid = get_taxid_from_assembly_report(assembly_report_url)
    tax_name = get_tax_name_for_taxid(taxid)

    d = dict(
        acc=acc,
        genome_url=genome_url,
        assembly_report_url=assembly_report_url,
        ncbi_tax_name=tax_name,
    )

    w.writerow(d)
    print(f"retrieved for {acc} - {tax_name}", file=sys.stderr)

    if fp:
        fp.close()

    return 0


if __name__ == "__main__":
    sys.exit(main())
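# Assign a GTDB lineage to each metabat2 bin using its best (rank 0) gather
# match, writing the lineage table consumed by charcoal for decontamination.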
library(dplyr)
library(readr)
library(purrr)
library(tidyr)

gtdb_lineages <- read_csv(snakemake@input[['gtdb_lineages']])

charcoal_lineages <- Sys.glob(paste0("outputs/metabat2_gather/", snakemake@wildcards[['sample']], "_bin.*_gather_gtdb-rs202-genomic.csv")) %>%
#charcoal_lineages <- unlist(snakemake@input[['gather']]) %>% 
  map_dfr(read_csv, col_types = "dddddlllcccddddcccd") %>%
  filter(gather_result_rank == 0) %>%
  select(query_name, name) %>%
  mutate(query_name = gsub(".*\\.", "", query_name),
         query_name = gsub("bin", "bin.", query_name),
         query_name = paste0(query_name, ".fa"),
         accession = gsub(" .*", "", name)) %>%
  left_join(gtdb_lineages, by = c("accession" = "ident")) %>%
  select(-name, -accession)

write_csv(charcoal_lineages, snakemake@output[['charcoal_lineages']])
install.packages("pagoo", repos="http://cran.us.r-project.org")
library(pagoo)
file.create(snakemake@output[['pagoo']])
library(readr)
library(dplyr)

#path <- "outputs/nbhd_sketch_tables_species/GCA_000162535.1-s__Parabacteroides_distasonis_long.csv"
#sketch_table <- read_csv(path)

sketch_table <- read_csv(snakemake@input[['csv']])
# filter samples that don't have enough k-mers for the species
# (using 10000 as threshold)
sketch_table_grp <- sketch_table %>%
  group_by(acc) %>%
  tally()

keep <- sketch_table_grp %>%
  filter(n > 10000)

pg_sigs <- sketch_table %>%
  filter(acc %in% keep$acc) %>%
  select(acc) %>%
  distinct() %>%
  mutate(acc = gsub("\\.G", "-G", acc), 
         acc = gsub("\\.s", "-s", acc),
         acc = paste0("outputs/nbhd_sigs_species/", acc, ".sig"))

write_tsv(pg_sigs, snakemake@output[['lst']], col_names = FALSE)
library(readr)
library(dplyr)

files <- read_tsv(snakemake@input[['txt']], col_names = "bins")
files <- files %>%
  mutate(bins = gsub("_sigs", "", bins),
         bins = gsub("\\.sig", "\\.ffn", bins))

write_tsv(files, snakemake@output[['txt']], col_names = FALSE)
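# Match each metabat2 bin to its best-species gather result and write out the
# list of bin signature paths belonging to the species encoded in the acc_db
# wildcard.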
library(readr)
library(dplyr)
library(tidyr)
library(purrr)

gtdb_lineages <- read_csv(snakemake@input[['lineages']])

# specified as a Sys.glob so as not to mess up the sample wildcard in the workflow
gather_bins <- Sys.glob("outputs/metabat2_gather/*csv") %>%
#gather_bins <- unlist(snakemake@input[['gather']]) %>%
  map_dfr(read_csv, col_types = c("dddddlllcccddddcccd")) %>%
  separate(col = name, into = c("accession"), remove = F, sep = " ", extra = "drop") %>%
  mutate(sample = basename(query_filename)) %>%
  mutate(sample = gsub("-.*", "", sample)) %>%
  left_join(gtdb_lineages, by = c("accession" = "ident"))

species_of_interest <- read_csv(unlist(snakemake@input[['acc_to_db']])) %>%
  #filter(!species %in% c("s__Ruminococcus_B_gnavus", "s__Clostridium_Q_symbiosum", "s__Roseburia_intestinalis")) %>%
  mutate(species = gsub("(s__.*?)_", "\\1 ", species))

acc_to_db <- read_csv(unlist(snakemake@input[['acc_to_db']])) %>%
  #filter(!species %in% c("s__Ruminococcus_B_gnavus", "s__Clostridium_Q_symbiosum", "s__Roseburia_intestinalis")) %>%
  mutate(acc_to_db = paste0(accession, "-", species)) %>%
  mutate(species = gsub("(s__.*?)_", "\\1 ", species))

# only consider best match for the bin
gather_bins <- gather_bins %>%
  filter(gather_result_rank == 0)

named_bins <- gather_bins %>% 
  left_join(acc_to_db, by = "species") %>%
  select(acc_to_db, query_filename) %>%
  mutate(sig_filename = gsub("metabat2", "metabat2_prokka_sigs", query_filename),
         sig_filename = gsub("/bin", "_bin", sig_filename),
         sig_filename = gsub(".fa", ".sig", sig_filename)) %>%
  select(acc_to_db, sig_filename)

# filter to single acc_db using wildcard
# and only write out sig path
tmp <- named_bins %>%
  filter(acc_to_db %in% snakemake@wildcards[["acc_db"]]) %>%
  select(sig_filename)

write_tsv(tmp, snakemake@output[['txt']], col_names = FALSE)
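# Build a pagoo pangenome object from protein k-mer presence/absence across
# samples, then plot a PCA colored by antibiotic use and a k-mer
# presence/absence binmap over the time series.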
library(pagoo)
library(readr)
library(ggplot2)
library(dplyr)

read_long_sketch_table_as_pagoo <- function(path, threshold = 2000){
  sketch_table <- read_csv(path)

  # filter samples that don't have enough k-mers for the species
  sketch_table_grp <- sketch_table %>%
    group_by(acc) %>%
    tally()

  print(sketch_table_grp)

  keep <- sketch_table_grp %>%
    filter(n > threshold)

  sketch_table <- sketch_table %>%
    filter(acc %in% keep$acc) %>%
    select(gene    = minhash, 
           org     = acc,
           cluster = minhash) 
  p <- pagoo(data = as.data.frame(sketch_table))
  return(p)
}

#list.files("outputs/nbhd_sketch_tables_species/")
#species_string1 <- ""
species_string1 <- snakemake@wildcards[['acc_db']]
species_string2 <- gsub("-", ".", species_string1)
species_string3 <- gsub(".*s__", "", species_string2)
species_string3 <- gsub("_", " ", species_string3)

pg <- read_long_sketch_table_as_pagoo(snakemake@input[['csv']], threshold = 0)
#pg <- read_long_sketch_table_as_pagoo(paste0("outputs/nbhd_sketch_tables_species/", 
#                                             species_string1, "_long.csv"), threshold = 0)
pg_organisms <- gsub(paste0(".", species_string2), "", as.character(pg$organisms$org))

destfile <- "inputs/hmp2_metadata.csv"
url <- "https://ibdmdb.org/tunnel/products/HMP2/Metadata/hmp2_metadata.csv"
if (!file.exists(destfile)) {
  download.file(url, destfile, method="auto")
}
hmp_metadata <- read_csv(destfile)

# serology data_type contains information on abx types
# tmp <- hmp_metadata %>%
#   select(participant_id = "Participant ID", data_id = "External ID", data_type,
#          week_num, diagnosis, antibiotics = "Antibiotics",
#          fecalcal, metronidazole = "Flagyl (Metronidazole)",
#          cipro = "Cipro (Ciprofloxin)", rifaxamin = "Xifaxin (rifaxamin)",
#          levaquin = Levaquin, other_abx = "Other Antibiotic:") %>%
#   filter(participant_id =="H4017")

h4017 <- hmp_metadata %>%
  select(data_id = "External ID", data_type,
         week_num, diagnosis, antibiotics = "Antibiotics") %>%
  filter(data_type == "metagenomics") %>%
  filter(data_id %in% c('HSM67VF9', 'HSM67VFD', 'HSM67VFJ', 'HSM6XRQB',
                        'HSM6XRQI', 'HSM6XRQK', 'HSM6XRQM', 'HSM6XRQO',
                        'HSM7CYY7', 'HSM7CYY9', 'HSM7CYYB', 'HSM7CYYD'))

h4017 <- h4017 %>%
  mutate(org = paste0(data_id, ".", species_string2)) %>%
  filter(data_id %in% pg_organisms)

pg$add_metadata(map = "org", as.data.frame(h4017))

pdf(snakemake@output[['pca']], height = 3, width = 4)
pg$gg_pca() +
  geom_point(aes(color = antibiotics)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "italic")) +
  labs(title = species_string3) +
  scale_color_manual(values = c("#1F78B4", "#33A02C"))
dev.off()

# try color binmap --------------------------------------------------------

tpm <- t(pg$pan_matrix)
tpm[which(tpm > 0, arr.ind = TRUE)] <- 1L
bm <- as.data.frame(tpm)
or <- order(rowSums(bm), decreasing = TRUE)
lvls <- rownames(bm)[or]
bm$Cluster <- factor(rownames(bm), levels = lvls)
bm <- reshape2::melt(bm, 'Cluster')
bm$value <- factor(bm$value, levels = c(1, 0))
bm <- left_join(bm, h4017, by = c("variable" = "org"))
bm$value_abx <- paste0(bm$value, "_", bm$antibiotics)
bm$value_abx <- factor(bm$value_abx, levels = c("0_No", "1_No", "0_Yes",  "1_Yes"))
colnames(bm)[which(colnames(bm) == 'variable')] <- "Organism"

#pdf(paste0("outputs/pagoo_species/", species_string1, "_binmap.pdf"), height = 3, width = 4)
pdf(snakemake@output[['binmap']], height = 3, width = 4)
ggplot(bm, aes(Cluster, as.factor(week_num), fill=value_abx)) +
  geom_raster() +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        plot.title = element_text(face = "italic")) +
  #scale_fill_grey(start = .2, end = .9)
  scale_fill_brewer(palette = "Paired", labels = c("0, No", "1, No", "0, Yes", "1, Yes")) +
  labs(x = "protein k-mer", y = "week number", fill = "presence\n& antibiotics",
       title = species_string3)
dev.off()
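# Map each gather match accession to its GTDB species name so that every
# query genome can be paired with a species-level database downstream.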
library(dplyr)
library(readr)

#lineages <- read_csv("inputs/gtdb-rs202.taxonomy.v2.csv")
lineages <- read_csv(snakemake@input[['lineages']])

gather <- read_csv(snakemake@input[['gather']]) %>%
#gather <- read_csv("outputs/genbank/gather_gtdb-rs202-genomic.x.genbank.gather.csv") %>%
  mutate(accession = gsub(" .*", "", name)) %>%
  left_join(lineages, by = c("accession" = "ident")) %>%
  select(accession, species) %>%
  #mutate(species = gsub(" ", "_", species),
  #       database = paste0("gtdb-rs202.", species, ".protein-k10.nodegraph"))
  mutate(species = gsub(" ", "_", species))

write_csv(gather, snakemake@output[['csv']])
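# Strip the leading @<n>:<n>:<n>: prefix from unmapped read names so the list
# can be used to extract reads by name (e.g. with seqtk subseq).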
library(dplyr)
library(readr)

read_names <- read_tsv(snakemake@input[['names']], col_names = c("names")) %>%
  mutate(names = gsub("^@[0-9]*:[0-5]:[0-5]:", "", names))

write_tsv(read_names, snakemake@output[["lst"]], col_names = FALSE)
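# Dump the hash values of a sourmash signature to a one-column CSV whose
# column name is the signature's file name.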
from sourmash import signature
import pandas as pd
import os
import sys
import argparse

def main():
    p = argparse.ArgumentParser()
    p.add_argument('signature')       # sourmash signature
    p.add_argument('output')          # output csv file name
    args = p.parse_args()

    # load the signature from disk
    sigfp = open(args.signature, 'rt')
    siglist = list(signature.load_signatures(sigfp))
    loaded_sig = siglist[0]

    # Get the minhashes
    mins = loaded_sig.minhash.hashes.keys()

    name = os.path.basename(args.signature)
    df = pd.DataFrame(mins, columns=[name])

    # write to a csv
    df.to_csv(args.output, index = False)

if __name__ == '__main__':
    sys.exit(main())
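## Combine per-signature hash CSVs into one long table with columns
## minhash, acc (signature name), and present (always 1).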
files <- snakemake@input
files <- unlist(files, use.names=FALSE)

## read files into list with each row labelled by accession. 
tab_long <- list()
for (i in seq_along(files)) {
  print(i)
  sig <- read.csv(files[i])           # read in signature csv as df
  acc <- colnames(sig)[1]             # signature/sample name from the header
  acc <- gsub("\\.sig$", "", acc)     # strip the .sig file suffix
  sig$acc <- acc                      # record the name in its own column
  colnames(sig) <- c("minhash", "acc")
  tab_long[[i]] <- sig
}

## bind into one dataframe
tab_long <- do.call(rbind, tab_long)
tab_long$present <- 1 # add a col with "1" for presence/absence
write.csv(tab_long, snakemake@output[['csv']],
          quote = F, row.names = F)
shell:'''
fastp --in1 {input.R1} \
  --in2 {input.R2} \
  --out1 {output.R1} \
  --out2 {output.R2} \
  --detect_adapter_for_pe \
  --qualified_quality_phred 4 \
  --length_required 31 --correction \
  --json {output.json} \
  --html {output.html}
'''
shell:'''
bbduk.sh -Xmx64g t=3 in={input.r1} in2={input.r2} out={output.r1} out2={output.r2} outm={output.human_r1} outm2={output.human_r2} k=31 ref={input.human}
'''
shell:'''
interleave-reads.py {input} | trim-low-abund.py --gzip -C 3 -Z 18 -M 60e9 -V - -o {output}
'''
SnakeMake From line 178 of main/Snakefile
shell:'''
repair.sh in1={input} out1={output.r1} out2={output.r2} outs={output.singleton} repair
'''
SnakeMake From line 198 of main/Snakefile
shell:'''
megahit -1 {input.r1} -2 {input.r2} --out-dir {output.tmp_dir} --out-prefix {wildcards.sample}
mv {output.tmp_dir}/{wildcards.sample}.contigs.fa {output.fa}
'''
shell:'''
bowtie2-build {input} {input}    
'''
shell:'''
bowtie2 -x {input.fa} -1 {input.r1} -2 {input.r2} |
    samtools view -bS -o {output}
'''
shell:'''
samtools sort {input} -o {output}
'''
shell:'''
samtools index {input}
'''
shell:'''
jgi_summarize_bam_contig_depths --outputDepth {output} {input.bam}
'''
SnakeMake From line 282 of main/Snakefile
shell:'''
metabat2 -m 1500 -i {input.fa} --abdFile {input.depth} -o {params.outdir}
'''
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.sample}.bin{wildcards.binn} {input}
"""
SnakeMake From line 309 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 324 of main/Snakefile
script: "scripts/generate_charcoal_lineages.R"
SnakeMake From line 347 of main/Snakefile
shell:'''
ls {params.genome_dir}/*fa | xargs -n 1 basename > {output} 
'''
SnakeMake From line 364 of main/Snakefile
run:
    with open(output.conf, 'wt') as fp:
        print(f"""\
SnakeMake From line 383 of main/Snakefile
shell:'''
python -m charcoal run {input.conf} -j {threads} clean --nolock --latency-wait 15 --rerun-incomplete --use-conda
'''
SnakeMake From line 405 of main/Snakefile
shell:'''
gunzip -c {input} > {output}
'''
SnakeMake From line 416 of main/Snakefile
shell:'''
prokka {input} --outdir {params.outdir} --prefix {wildcards.sample}_bin.{wildcards.bin} \
    --metagenome --force --locustag {wildcards.sample}_bin.{wildcards.bin} --cpus {threads} --centre X --compliant
'''
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.sample}_bin.{wildcards.bin} {input}
"""
SnakeMake From line 443 of main/Snakefile
script: "scripts/metabat_generate_lists_of_bins_of_same_species.R"
SnakeMake From line 470 of main/Snakefile
shell:"""
sourmash sig intersect -o {output} --from-file {input} 
"""
SnakeMake From line 480 of main/Snakefile
shell:"""
sourmash sig rename -o {output} {input} {wildcards.acc_db}_metabat2_core
"""
SnakeMake From line 492 of main/Snakefile
shell:"""
sourmash sig merge --name {wildcards.acc_db}_metabat2_all -o {output} --from-file {input} 
"""
SnakeMake From line 504 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 516 of main/Snakefile
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.sample} {input}
"""
SnakeMake From line 533 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 551 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 569 of main/Snakefile
script: "scripts/gather_gtdb_to_sgc_queries.R"
SnakeMake From line 586 of main/Snakefile
shell: """
    python scripts/genbank_genomes.py {wildcards.acc} \
        --output {output.csvfile}
"""
SnakeMake From line 619 of main/Snakefile
run:
    with open(input.csvfile, 'rt') as infp:
        r = csv.DictReader(infp)
        rows = list(r)
        assert len(rows) == 1
        row = rows[0]
        acc = row['acc']
        assert wildcards.acc.startswith(acc)
        url = row['genome_url']
        name = row['ncbi_tax_name']

        print(f"downloading genome for acc {acc}/{name} from NCBI...", file=sys.stderr)
        with open(output.genome, 'wb') as outfp:
            with urllib.request.urlopen(url) as response:
                content = response.read()
                outfp.write(content)
                print(f"...wrote {len(content)} bytes to {output.genome}",
                    file=sys.stderr)
SnakeMake From line 647 of main/Snakefile
run:
    # something weird is happening here where snakemake solves the inputs correctly, but 
    # "queries" ends up being the gather csv file. So...parse the CSV file for accession
    # numbers. Sigh.
    gather_csv = input[0]
    genome_accs = []
    with open(gather_csv, 'rt') as fp:
        r = csv.DictReader(fp)
        for row in r:
            acc = row['name'].split(' ')[0]
            acc = "outputs/genbank_genomes/" + acc + "_genomic.fna.gz"
            genome_accs.append(acc)

    query_list = "\n- ".join(genome_accs)
    #query_list = "\n- ".join(input.queries)
    with open(output.conf, 'wt') as fp:
       print(f"""\
shell:'''
python -m spacegraphcats run {input.conf} extract_reads --nolock --outdir={params.outdir} --rerun-incomplete 
'''
SnakeMake From line 716 of main/Snakefile
shell:'''
ls {output}
'''
SnakeMake From line 727 of main/Snakefile
shell:"""
sourmash sketch dna -p k=31,scaled=2000 -o {output} --name {wildcards.acc} {input}
"""
SnakeMake From line 740 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --output-unassigned {output.un} --scaled 2000 -k 31 {input.sig} {input.db} 
'''
SnakeMake From line 758 of main/Snakefile
shell:'''
orpheum translate --alphabet protein --peptide-ksize 10  --peptides-are-bloom-filter --noncoding-nucleotide-fasta {output.nuc_noncoding} --coding-nucleotide-fasta {output.nuc} --csv {output.csv} --json-summary {output.json} {input.ref} {input.fasta} > {output.pep}
'''
SnakeMake From line 778 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc} {input}
"""
SnakeMake From line 791 of main/Snakefile
shell:'''
python scripts/sig_to_csv.py {input} {output}
'''
SnakeMake From line 804 of main/Snakefile
script: "scripts/sketch_csv_to_long.R"
SnakeMake From line 818 of main/Snakefile
script: "scripts/query_to_species_db.R"
SnakeMake From line 838 of main/Snakefile
shell:'''
orpheum translate --jaccard-threshold 0.39 --alphabet protein --peptide-ksize 10  --peptides-are-bloom-filter --noncoding-nucleotide-fasta {output.nuc_noncoding} --coding-nucleotide-fasta {output.nuc} --csv {output.csv} --json-summary {output.json} {input.ref} {input.fasta} > {output.pep}
'''
SnakeMake From line 856 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db} {input}
"""
SnakeMake From line 869 of main/Snakefile
shell:'''
python scripts/sig_to_csv.py {input} {output}
'''
SnakeMake From line 882 of main/Snakefile
script: "scripts/sketch_csv_to_long.R"
SnakeMake From line 896 of main/Snakefile
script: "scripts/install_pagoo.R"
SnakeMake From line 906 of main/Snakefile
script: "scripts/pagoo_plt.R"
SnakeMake From line 921 of main/Snakefile
shell:"""
sourmash sketch dna -p k=51,scaled=2000 -o {output} --name {wildcards.acc_db} {input}
"""
SnakeMake From line 933 of main/Snakefile
shell:'''
sourmash gather -o {output.csv} --threshold-bp 0 --save-matches {output.matches} --scaled 2000 -k 51 {input.sig} {input.db} 
'''
SnakeMake From line 950 of main/Snakefile
script: "scripts/kaa_generate_list_of_sigs_to_intersect.R"
SnakeMake From line 963 of main/Snakefile
shell:"""
sourmash sig intersect -o {output} --from-file {input} 
"""
SnakeMake From line 973 of main/Snakefile
shell:"""
sourmash sig rename -o {output} {input} {wildcards.acc_db}_kaa_core
"""
SnakeMake From line 985 of main/Snakefile
shell:"""
sourmash sig merge --name {wildcards.acc_db}_kaa_all -o {output} {input} 
"""
SnakeMake From line 998 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 1010 of main/Snakefile
shell:'''
sourmash compare -o {output.comp} --csv {output.csv} {input}
'''
SnakeMake From line 1031 of main/Snakefile
shell:'''
sourmash compare --max-containment -o {output.comp} --csv {output.csv} {input}
'''
SnakeMake From line 1048 of main/Snakefile
shell:'''
grep ">" {input} > {output}
'''
SnakeMake From line 1068 of main/Snakefile
script: "scripts/identify_core_gene_sequence_names.R"
SnakeMake From line 1083 of main/Snakefile
shell:'''
seqtk subseq {input.fa} {input.core_names} > {output}
'''
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1107 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db}_roary_core {input}
"""
SnakeMake From line 1119 of main/Snakefile
shell:"""
sourmash sketch protein -p k=10,scaled=100,protein -o {output} --name {wildcards.acc_db}_roary_all {input}
"""
SnakeMake From line 1131 of main/Snakefile
shell:"""
python scripts/sig_to_csv.py {input} {output}
"""
SnakeMake From line 1143 of main/Snakefile
shell:"""
sourmash compare --csv {output} {input}
"""
SnakeMake From line 1162 of main/Snakefile
shell:"""
sourmash compare --containment --csv {output} {input}
"""
SnakeMake From line 1177 of main/Snakefile
shell:"""
sourmash compare --max-containment --csv {output} {input}
"""
SnakeMake From line 1192 of main/Snakefile
shell:"""
sourmash compare --csv {output} {input}
"""
SnakeMake From line 1213 of main/Snakefile
shell:"""
sourmash compare --containment --csv {output} {input}
"""
SnakeMake From line 1228 of main/Snakefile
shell:"""
sourmash compare --max-containment --csv {output} {input}
"""
SnakeMake From line 1243 of main/Snakefile
script: "scripts/metabat_generate_lists_of_bins_of_same_species_prokka_ffn.R"
SnakeMake From line 1259 of main/Snakefile
shell:'''
xargs cat < {input} > {output}
'''
SnakeMake From line 1268 of main/Snakefile
shell:'''
cd-hit-est -c .95 -d 0 -i {input} -o {output}
'''
shell:'''
bwa index {input}
''' 
shell:'''
bwa index {input}
''' 
shell:'''
bwa mem -p -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
bwa mem -p -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1410 of main/Snakefile
shell:'''
paladin index -r3 {input}
'''
SnakeMake From line 1422 of main/Snakefile
shell:'''
transeq {input} {output}
'''
SnakeMake From line 1434 of main/Snakefile
shell:'''
paladin index -r3 {input}
'''
SnakeMake From line 1446 of main/Snakefile
shell:'''
paladin align -C -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
grep "^@" {input} > {output}
'''
SnakeMake From line 1505 of main/Snakefile
script: "scripts/rm_unmapped_read_name_prefixes.R"
SnakeMake From line 1517 of main/Snakefile
shell:'''
seqtk subseq {input.reads} {input.lst} > {output}
'''
shell:'''
paladin align -C -t {threads} {input.ref_assembly} {input.reads} | samtools sort -o {output} -
'''
shell:'''
samtools view -b -f 4 {input} > {output}
'''
shell:'''
samtools view -h {input} | samtools stats | grep '^SN' | cut -f 2- > {output}
'''
shell:'''
samtools fastq -N -0 {output} {input}
'''
shell:'''
grep "^@" {input} > {output}
'''
SnakeMake From line 1589 of main/Snakefile
script: "scripts/rm_unmapped_read_name_prefixes.R"
SnakeMake From line 1601 of main/Snakefile
shell:'''
seqtk subseq {input.reads} {input.lst} > {output}
'''
shell:"""
sourmash sketch dna -p k=51,scaled=2000 -o {output} --name {wildcards.sample}-{wildcards.acc_db} {input}
"""
SnakeMake From line 1629 of main/Snakefile
shell:'''
sourmash gather -o {output} --threshold-bp 0 --scaled 2000 -k 51 {input.sig} {input.db} 
'''
SnakeMake From line 1643 of main/Snakefile
shell:'''
megahit -r {input} --out-dir {output.tmp_dir} --out-prefix {wildcards.sample}-{wildcards.acc_db}
mv {output.tmp_dir}/{wildcards.sample}-{wildcards.acc_db}.contigs.fa {output.fa}
'''
shell:'''
prokka {input} --outdir {params.outdir} --prefix {wildcards.sample}-{wildcards.acc_db} \
    --metagenome --force --locustag {wildcards.sample} --cpus {threads} --centre X --compliant
'''
shell:'''
cat {input} > {output}
'''
SnakeMake From line 1684 of main/Snakefile

URL: https://github.com/dib-lab/2021-metapangenome-example
Name: 2021-metapangenome-example
Version: v1.0
License: BSD 3-Clause "New" or "Revised" License