Snakemake workflow for CITE-seq analysis with alevin-fry and Seurat


This workflow is untested and a work in progress. It is based on this tutorial.

Code Snippets

Wrapper script: download genome annotation (GTF/GFF3) from Ensembl.
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
fmt = snakemake.params.fmt
build = snakemake.params.build
flavor = snakemake.params.get("flavor", "")

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

if flavor:
    flavor += "."

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
    suffix = "gtf.gz"
elif fmt == "gff3":
    suffix = "gff3.gz"

url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    fmt=fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)

try:
    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
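As a sanity check, the URL template above can be traced with a concrete parameter set. The values below are illustrative assumptions only, not taken from this workflow's configuration:

# Illustrative parameter values (assumptions for this example, not the workflow's config).
species = "homo_sapiens"
release = 104
build = "GRCh38"
fmt = "gtf"
flavor = ""

branch = "grch37/" if release >= 81 and build == "GRCh37" else ""
flavor = flavor + "." if flavor else ""
suffix = {"gtf": "gtf.gz", "gff3": "gff3.gz"}[fmt]

url = (
    f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/{fmt}/{species}/"
    f"{species.capitalize()}.{build}.{release}.{flavor}{suffix}"
)
print(url)
# ftp://ftp.ensembl.org/pub/release-104/gtf/homo_sapiens/Homo_sapiens.GRCh38.104.gtf.gz

Note that the release/build check in the script sends GRCh37 requests for release 81 or newer to the dedicated grch37/ branch of the FTP server.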
Wrapper script: download reference sequence (FASTA) from Ensembl.
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

success = False
for suffix in suffixes:
    url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species_cap}.{spec}.{suffix}".format(
        release=release,
        species=species,
        datatype=datatype,
        spec=spec.format(build=build, release=release),
        suffix=suffix,
        species_cap=species.capitalize(),
        branch=branch,
    )

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    print(
        "Unable to download requested sequence data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
Wrapper script: build a salmon index.
__author__ = "Tessa Pierce"
__copyright__ = "Copyright 2018, Tessa Pierce"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

shell(
    "salmon index -t {snakemake.input} -i {snakemake.output} "
    " --threads {snakemake.threads} {extra} {log}"
)
Wrapper script: download reads from SRA with fasterq-dump.
__author__ = "Johannes Köster, Derek Croote"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

outdir = os.path.dirname(snakemake.output[0])
if outdir:
    outdir = "--outdir {}".format(outdir)

extra = snakemake.params.get("extra", "")

with tempfile.TemporaryDirectory() as tmp:
    shell(
        "fasterq-dump --temp {tmp} --threads {snakemake.threads} "
        "{extra} {outdir} {snakemake.wildcards.accession} {log}"
    )
Rule excerpts from the workflow (directives only):
script:
    "../scripts/seurat.r"
script:
    "../scripts/plot-hto-counts.r"
script:
    "../scripts/filter-normalize-demux.r"
script:
    "../scripts/plot-counts-hto-filtered.r"
script:
    "../scripts/filter-negatives.r"
script:
    "../scripts/plot-umap-singlets-doublets.r"
script:
    "../scripts/filter-to-singlets.r"
wrapper:
    "0.74.0/bio/salmon/index"
shell:
    "salmon alevin --threads {threads} "
    "-l ISR -i {input.index} -1 {input.fq1} -2 {input.fq2} "
    "--read-geometry {params.sample[geometry][reads]} --bc-geometry {params.sample[geometry][barcodes]} "
    "--umi-geometry {params.sample[geometry][umis]} -o {output} --sketch "
    "2> {log}"
shell:
    "(alevin-fry generate-permit-list -d fw -i {input} -o {output} -k &&"
    " alevin-fry collate -r {input} -i {output} -t {threads})"
    " 2> {log}"
shell:
    "alevin-fry quant -m {input.t2g} -i {input.rad} "
    "-o {output} -r cr-like -t {threads} --use-mtx 2> {log}"
wrapper:
    "0.74.0/bio/reference/ensembl-sequence"
From line 12 of rules/ref.smk.
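The ensembl-sequence wrapper above is driven by the params read in the download script shown earlier (species, datatype, build, release). A minimal sketch of such a rule, with a hypothetical rule name, output path, and config keys:

rule get_transcriptome:
    output:
        "resources/transcriptome.cdna.fasta",
    log:
        "logs/get_transcriptome.log",
    params:
        species=config["reference"]["species"],   # config key used elsewhere in this workflow
        datatype="cdna",
        build=config["reference"]["build"],       # assumed key
        release=config["reference"]["release"],   # assumed key
    wrapper:
        "0.74.0/bio/reference/ensembl-sequence"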
wrapper:
    "0.74.0/bio/reference/ensembl-annotation"
From line 28 of rules/ref.smk.
script:
    "../scripts/get-geneid2name.py"
From line 39 of rules/ref.smk.
shell:
    "awk '{{print $1\"\\t\"$1;}}' {input} > {output} 2> {log}"
From line 69 of rules/ref.smk.
wrapper:
    "0.74.0/bio/sra-tools/fasterq-dump"
R script: remove cells classified as Negative.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

object <- readRDS(snakemake@input[[1]])

# remove the negatives
object <- subset(object, idents = "Negative", invert = TRUE)

saveRDS(object, snakemake@output[[1]])
R script: keep only Singlet cells and plot ADT counts.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

object <- readRDS(snakemake@input[[1]])

# subsetting the data to Singlets
object <- subset(object, idents = "Singlet")

pdf(file = snakemake@output[["pdf"]])
VlnPlot(object, features = c("nCount_ADT"),  pt.size = 0.1)
dev.off()

saveRDS(object, file = snakemake@output[["rds"]])
Script: fetch an Ensembl gene ID to gene name mapping via BioMart.
import sys
sys.stderr = open(snakemake.log[0], "w")

from pybiomart import Dataset

prefix, suffix = snakemake.config["reference"]["species"].split("_", 1)
# BioMart datasets are named <first letter of genus><species epithet>, e.g. hsapiens
species = (prefix[0] + suffix).lower()

dataset = Dataset(name=f"{species}_gene_ensembl", host="http://www.ensembl.org")

# write a plain two-column TSV (gene ID, gene name) without index or header,
# so that the downstream R script picks the columns up as V1/V2 via read.table()
dataset.query(attributes=["ensembl_gene_id", "external_gene_name"]).to_csv(
    snakemake.output[0], sep="\t", index=False, header=False
)
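For example, under the genus_species naming assumed above, the BioMart dataset name resolves like this (illustrative value only):

# Illustrative only: assumes the configured species follows the genus_species convention.
prefix, suffix = "homo_sapiens".split("_", 1)
print((prefix[0] + suffix).lower() + "_gene_ensembl")  # -> hsapiens_gene_ensembl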
R script: plot HTO and RNA counts of the filtered object.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

object <- readRDS(snakemake@input[[1]])

# write both violin plots into a single (multi-page) PDF
pdf(file = snakemake@output[[1]])
VlnPlot(object, features = "nCount_HTO", pt.size = 0.1, log = TRUE)
VlnPlot(object, features = "nCount_RNA", pt.size = 0.1, log = TRUE)
dev.off()
R script: plot HTO counts.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

seurat_object <- readRDS(snakemake@input[[1]])

pdf(file = snakemake@output[[1]])
VlnPlot(seurat_object, features = c("nCount_HTO"), log = TRUE)
dev.off()
R script: UMAP embedding of the HTO classifications (singlets, doublets, hashtags).
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

object <- readRDS(snakemake@input[[1]])

# perform PCA and generate UMAP embeddings
object <- RunPCA(object, reduction.name = "hto.pca", reduction.key = "HPC_", verbose = F, approx=FALSE)
object <- RunUMAP(object, reduction = "hto.pca", dims = 1:9, reduction.name = "hto.umap", 
                  reduction.key = "HUMAP_", umap.method = 'umap-learn', metric='correlation', verbose = F)

pdf(file = snakemake@output[["by_xlet"]])
DimPlot(object, reduction = "hto.umap", label = F)
dev.off()

pdf(file = snakemake@output[["by_hashtag"]])
DimPlot(object, reduction = "hto.umap", label = T, group.by = "hash.ID" )
dev.off()
R script: load the alevin-fry quantifications (RNA, ADT, HTO) into a Seurat object.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressPackageStartupMessages({
    library(devtools)
    library(ggplot2)
    library(SingleCellExperiment)
    library(Seurat)
})

# set the seed 
set.seed(271828)

#' Read alevin-fry quantifications into a SingleCellExperiment object
load_fry <- function(frydir, which_counts = c('S', 'A'), verbose = FALSE) {
  suppressPackageStartupMessages({
    library(rjson)
    library(Matrix)
    library(SingleCellExperiment)
  })

  # read in metadata
  meta_info <- fromJSON(file = file.path(frydir, "meta_info.json"))
  ng <- meta_info$num_genes
  usa_mode <- meta_info$usa_mode

  if (usa_mode) {
    if (length(which_counts) == 0) {
      stop("Please at least provide one status in 'U' 'S' 'A' ")
    }
    if (verbose) {
      message("processing input in USA mode, will return ", paste(which_counts, collapse = '+'))
    }
  } else if (verbose) {
    message("processing input in standard mode, will return spliced count")
  }

  # read in count matrix
  af_raw <- readMM(file = file.path(frydir, "alevin", "quants_mat.mtx"))
  # if usa mode, each gene gets 3 rows, so the actual number of genes is ng/3
  if (usa_mode) {
    if (ng %% 3 != 0) {
      stop("The number of quantified targets is not a multiple of 3")
    }
    ng <- as.integer(ng/3)
  }

  # read in gene name file and cell barcode file
  afg <- read.csv(file.path(frydir, "alevin", "quants_mat_cols.txt"), 
                  strip.white = TRUE, header = FALSE, nrows = ng, 
                  col.names = c("gene_ids"), row.names = 1)
  afc <- read.csv(file.path(frydir, "alevin", "quants_mat_rows.txt"), 
                  strip.white = TRUE, header = FALSE,
                  col.names = c("barcodes"), row.names = 1)

  # if in usa_mode, sum up counts in different status according to which_counts
  if (usa_mode) {
    rd <- list("S" = seq(1, ng), "U" =  seq(ng + 1, 2 * ng),
               "A" =  seq(2 * ng + 1, 3 * ng))
    o <- af_raw[, rd[[which_counts[1]]], drop = FALSE]
    for (wc in which_counts[-1]) {
      o <- o + af_raw[, rd[[wc]], drop = FALSE]
    }
  } else {
    o <- af_raw
  }

  # create SingleCellExperiment object
  sce <- SingleCellExperiment(list(counts = t(o)),
                              colData = afc,
                              rowData = afg
  )
  sce
}

hto_q <- load_fry(snakemake@input[["hto"]], verbose = TRUE)
adt_q <- load_fry(snakemake@input[["adt"]], verbose = TRUE)
rna_q <- load_fry(snakemake@input[["rna"]], verbose = TRUE)

common.cells <- intersect(colnames(rna_q), colnames(adt_q))
common.cells <- intersect(common.cells , colnames(hto_q))

gid_to_gname <- read.table(snakemake@input[["geneid2name"]])
rownames(rna_q) <- gid_to_gname$V2[match(rownames(rna_q), gid_to_gname$V1)]

# seurat
seurat_object <- CreateSeuratObject(counts(rna_q)[, which(colnames(rna_q) %in% common.cells)])
seurat_object[["ADT"]] <- CreateAssayObject(counts = counts(adt_q)[, which(colnames(adt_q) %in% common.cells)])
seurat_object[["HTO"]] <- CreateAssayObject(counts = counts(hto_q)[, which(colnames(hto_q) %in% common.cells)])


saveRDS(seurat_object, snakemake@output[[1]])
(21 additional snippets are not shown here.)

Name: cite-seq-alevin-fry-seurat
URL: https://github.com/snakemake-workflows/cite-seq-alevin-fry-seurat
Version: 1
Maintainers: public
Copyright: Public Domain
License: None