Processing pipeline for public multiome and scATAC-seq immune datasets

public public 1yr ago 0 bookmarks

Processing pipeline for public single-cell epigenomic immune datasets.

Installing dependencies

All the required dependencies needed to run the workflow can be installed automatically by creating a new conda environment.

First ensure that conda or mamba is installed and available.

To create a new environment with the dependencies installed, run:

# using conda
conda env create -f environment.yaml
# using mamba
mamba env create -f environment.yaml

Running the workflow

This workflow involves downloading data from AWS using the AWS command line tools. To enable the download, you will first need to create an AWS account and set up the AWS command line tools by running aws configure . Note that some of the data downloaded may incur charges from AWS .

To run the Snakemake workflow, first activate the conda environment containing the required dependencies:

conda activate immune

Next, run snakemake with the desired options. Setting the -j parameter controls the maximum number of cores used by the workflow:

snakemake -j 24

See the snakemake documentation for a complete list of available options.

Datasets

PBMC

pbmc_atac_500

pbmc_atac_1k

pbmc_atac_5k

pbmc_atac_10k

pbmc_atac_10k_chromium

pbmc_atac_10k_chromiumX

pbmc_multiome_3k_sorted

pbmc_multiome_3k_unsorted

pbmc_multiome_10k_sorted

pbmc_multiome_10k_unsorted

pbmc_multiome_10k_chromium

pbmc_multiome_10k_chromiumX

pbmc_reference

BMMC

bmmc_atac

bmmc_multiome

bmmc_reference

Code Snippets

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
library(Signac)
library(Seurat)
library(SummarizedExperiment)
library(future)
library(GenomicRanges)

plan("multicore", workers = 8)
options(future.globals.maxSize = 100 * 1024 ^ 3)

### Load processed object for the metadata
orig.obj <- readRDS("data/bmmc_atac/scATAC-Healthy-Hematopoiesis-191120.rds")
annot <- readRDS("data/annotations_hg38.rds")

# get all fragment files
frag.files <- list.files("data/bmmc_atac/", pattern = "^scATAC_", full.names = TRUE)
frag.files <- paste0(frag.files, "/fragments.tsv.gz")
names(frag.files) <-  list.files("data/bmmc_atac/", pattern = "^scATAC_")

# call peaks on each and combine
pk.all <- list()
for (i in seq_along(frag.files)) {
  pks <- CallPeaks(
    object = frag.files[[i]]
  )
  pk.all[[i]] <- pks
}

gr.combined <- Reduce(f = c, x = pk.all)
gr.combined <- GenomicRanges::reduce(gr.combined)
gr.combined <- keepStandardChromosomes(gr.combined, pruning.mode = "coarse")

# quantify and create objects
md <- colData(orig.obj)

obj.list <- list()
for (i in seq_along(frag.files)) {
  obj_ident <- gsub(pattern = "scATAC_", x = names(frag.files)[[i]], replacement = "")
  cells.keep <- md[md$Group == obj_ident, "Barcode"]
  frags <- CreateFragmentObject(
    path = frag.files[[i]],
    cells = cells.keep
  )
  peakcounts <- FeatureMatrix(
    fragments = frags,
    features = gr.combined,
    cells = cells.keep
  )
  obj <- CreateChromatinAssay(counts = peakcounts, fragments = frags)
  obj <- CreateSeuratObject(counts = obj, assay = "ATAC")
  obj <- RenameCells(obj, add.cell.id = obj_ident)
  obj.list[[i]] <- obj
}

rownames(md) <- paste0(md$Group, "_", md$Barcode)
md <- as.data.frame(md)
bmmc <- merge(x = obj.list[[1]], y = obj.list[2:length(obj.list)])
bmmc <- AddMetaData(object = bmmc, metadata = md)
Annotation(bmmc) <- annot

bmmc$tissue <- sapply(strsplit(bmmc$Group, "_"), `[[`, 1)
bmmc$sample <- sapply(strsplit(bmmc$Group, "_"), `[[`, 2)

# remove PBMC
Idents(bmmc) <- "tissue"
bmmc <- subset(bmmc, idents = "PBMC", invert = TRUE)

bmmc <- NucleosomeSignal(bmmc)

# process
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc)
bmmc <- RunSVD(bmmc)
bmmc <- RunUMAP(bmmc, reduction = "lsi", dims = 2:40)
ga <- GeneActivity(bmmc)
bmmc[["GA"]] <- CreateAssayObject(counts = ga)

DefaultAssay(bmmc) <- "GA"
bmmc <- NormalizeData(bmmc)

DefaultAssay(bmmc) <- "ATAC"

# quantify multiome peaks in atac
bmmc_multiome <- readRDS("objects/bmmc_multiome.rds")

get_idf <- function(x) {
  rsums <- rowSums(x = x)
  idf <- ncol(x = x) / rsums
  idf <- log(1 + idf)
  return(idf)
}

multiome_idf <- get_idf(x = GetAssayData(bmmc_multiome, assay = "peaks", slot = "counts"))

multiome_counts <- FeatureMatrix(
  fragments = Fragments(bmmc),
  features = granges(bmmc_multiome),
  cells = colnames(bmmc)
)
bmmc[['peaks']] <- CreateChromatinAssay(
  counts = multiome_counts,
  fragments = Fragments(bmmc),
  annotation = Annotation(bmmc)
)

bmmc <- subset(bmmc, subset = nucleosome_signal < 5 & nCount_peaks < 50000)

DefaultAssay(bmmc) <- "peaks"
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc, idf = multiome_idf)
bmmc <- RunSVD(bmmc, scale.embeddings = FALSE)
bmmc <- RunUMAP(
  object = bmmc,
  reduction = "lsi",
  dims = 2:30,
  return.model = TRUE,
  reduction.name = "umap.peaks"
)

# integrate embeddings
atac <- SplitObject(bmmc, split.by = "Group")
atac <- lapply(atac, RunSVD, scale.embeddings = FALSE)

atac_anchors <- FindIntegrationAnchors(
  object.list = atac,
  reduction = "rlsi",
  dims = 2:30
)

atac_integrated <- IntegrateEmbeddings(
  anchorset = atac_anchors,
  new.reduction.name = "integrated_lsi",
  reductions = bmmc[["lsi"]],
  dims.to.integrate = 1:30
)
bmmc[["integrated_lsi"]] <- atac_integrated[["integrated_lsi"]]

saveRDS(object = bmmc, file = "objects/bmmc_atac.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(Signac)
library(Seurat)
library(Matrix)


rna_counts <- readMM(file = "data/bmmc_multiome/rna/counts.mtx")
rna_metadata <- read.table(file = "data/bmmc_multiome/rna/metadata.csv", sep = ",", header = TRUE, row.names = 1)
genes <- read.table(file = "data/bmmc_multiome/rna/genes.csv", sep = ",", header = TRUE, row.names = 1)

rna_counts <- t(rna_counts)
rna_counts <- as(rna_counts, "dgCMatrix")
rownames(rna_counts) <- rownames(genes)
colnames(rna_counts) <- rownames(rna_metadata)

annotations <- readRDS("data/annotations_hg38.rds")

multiome <- CreateSeuratObject(counts = rna_counts, project = "multiome", meta.data = rna_metadata)

atac_counts <- readMM(file = "data/bmmc_multiome/atac/counts.mtx")
atac_metadata <- read.table(file = "data/bmmc_multiome/atac/metadata.csv", sep = ",", header = TRUE, row.names = 1)
peaks <- read.table(file = "data/bmmc_multiome/atac/peaks.csv", sep = ",", header = TRUE, row.names = 1)

atac_counts <- t(atac_counts)
atac_counts <- as(atac_counts, "dgCMatrix")
rownames(atac_counts) <- rownames(peaks)
colnames(atac_counts) <- rownames(atac_metadata)

common_cells <- intersect(colnames(multiome), colnames(atac_counts))

atac_counts <- atac_counts[, common_cells]
multiome <- multiome[, common_cells]

multiome[["peaks"]] <- CreateChromatinAssay(counts = atac_counts, annotation = annotations)

# create fragment file for each sample
# name = name of cell in object
# element = name of cell in fragment file
frags <- list.files("data/bmmc_multiome", pattern = "*.tsv.gz$", recursive = TRUE, full.names = TRUE)
fraglist <- list()
for (i in seq_along(along.with = frags)) {
  f <- frags[[i]]
  sample.name <- gsub(pattern = "data/bmmc_multiome/", replacement = "", x = f)
  sample.name <- gsub(pattern = "/atac_fragments.tsv.gz", replacement = "", x = sample.name)
  cells.object <- colnames(multiome)[grepl(sample.name, colnames(multiome))]
  cells.fragfile <- sapply(cells.object, function(x) {
    a <- substr(x, start = 1, stop = 16)
    paste0(a, "-1")
  }, USE.NAMES = TRUE)
  fraglist[[i]] <- CreateFragmentObject(path = f, cells = cells.fragfile)
}

# process RNA
DefaultAssay(multiome) <- "RNA"

multiome <- FindVariableFeatures(multiome)
multiome <- NormalizeData(multiome)
multiome <- SCTransform(multiome)
multiome <- RunPCA(multiome)
multiome <- RunUMAP(
  object = multiome,
  reduction = "pca",
  dims = 1:40,
  reduction.name = "umap.sct",
  return.model = TRUE
)

# process ATAC
DefaultAssay(multiome) <- "peaks"
Fragments(multiome) <- fraglist

multiome <- FindTopFeatures(multiome)
multiome <- RunTFIDF(multiome)
multiome <- RunSVD(multiome)
multiome <- RunUMAP(
  object = multiome,
  reduction = "lsi",
  dims = 2:40,
  reduction.name = "umap.atac",
  return.model = TRUE
)

saveRDS(object = multiome, file = "objects/bmmc_multiome.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
library(Seurat)

obj <- readRDS("data/bmmc_reference/fullref.Rds")
obj <- NormalizeData(obj)
obj <- FindVariableFeatures(obj)
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- RunUMAP(
  object = obj,
  reduction = "pca",
  dims = 1:40,
  reduction.name = "umap",
  return.model = TRUE
)
obj$dataset <- 'reference'
saveRDS(obj, "objects/bmmc_reference.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import anndata
import sys
import os
from scipy import io
import pandas as pd

basepath = sys.argv[1]

bmmc = anndata.read_h5ad(basepath + "/GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad")
df = bmmc.var
atac_features = df.index[df['feature_types'] == 'ATAC'].tolist()
rna_features = df.index[df['feature_types'] == 'GEX'].tolist()
atac = bmmc[:, atac_features]
rna = bmmc[:, rna_features]

# RNA
if not os.path.isdir(basepath + "/rna"):
    os.makedirs(basepath + "/rna")
rnacounts = rna.layers["counts"]
io.mmwrite(target=basepath + "/rna/counts.mtx", a=rnacounts)
genenames = rna.var
rna_md = rna.obs
genenames.to_csv(basepath + "/rna/genes.csv")
rna_md.to_csv(basepath + "/rna/metadata.csv")

# ATAC
if not os.path.isdir(basepath + "/atac"):
    os.makedirs(basepath + "/atac")
ataccounts = atac.layers["counts"]
io.mmwrite(target=basepath + "/atac/counts.mtx", a=ataccounts)
peaknames = atac.var
atac_md = atac.obs
peaknames.to_csv(basepath + "/atac/peaks.csv")
atac_md.to_csv(basepath + "/atac/metadata.csv")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
library(Signac)
library(EnsDb.Hsapiens.v86)
library(GenomeInfoDb)

# extract gene annotations from EnsDb
annotations.hg38 <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v86)

# change to UCSC style since the data was mapped to hg38
seqlevelsStyle(annotations.hg38) <- 'UCSC'
genome(annotations.hg38) <- "hg38"

# save
saveRDS(object = annotations.hg38, file = "data/annotations_hg38.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(Seurat)
library(Signac)
library(GenomicRanges)

args = commandArgs(trailingOnly = TRUE)
ncore <- as.numeric(args[1])
annot.path <- args[2]
peak.path <- args[3]
sample_name <- args[4]

annot <- readRDS(annot.path)
peaks <- read.table(file = peak.path, col.names = c("chrom", "start", "end"))
peaks <- makeGRangesFromDataFrame(df = peaks)
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")

message("Using ", ncore, " cores")
if (ncore > 1) {
  library(future)
  plan("multicore", workers = ncore)
  options(future.globals.maxSize = 100 * 1024^3)
}

# create object
countfile <- list.files(paste0("data/pbmc_atac/", sample_name), pattern = "*.h5$", full.names = TRUE)
frags <- list.files(paste0("data/pbmc_atac/", sample_name), pattern = "*.tsv.gz$", full.names = TRUE)
md_file <- list.files(paste0("data/pbmc_atac/", sample_name), patter = "*.csv$", full.names = TRUE)

counts <- Read10X_h5(filename = countfile)
metadata <- read.table(file = md_file, sep = ",", header = TRUE, row.names = 1)

assay <- CreateChromatinAssay(counts = counts, fragments = frags, genome = "hg38", annotation = annot, sep = c(":", "-"))
obj <- CreateSeuratObject(counts = assay, meta.data = metadata, assay = "ATAC")

# process ATAC
DefaultAssay(obj) <- "ATAC"
obj <- NucleosomeSignal(obj)
obj <- TSSEnrichment(obj)

# add gene activity
ga <- GeneActivity(object = obj)
obj[["GA"]] <- CreateAssayObject(counts = ga)
obj <- NormalizeData(object = obj, assay = 'GA')

# create assay containing shared peaks
common_counts <- FeatureMatrix(
  fragments = Fragments(obj)[[1]],
  features = peaks,
  cells = colnames(obj)
)
obj[["common"]] <- CreateChromatinAssay(counts = common_counts, fragments = frags, annotation = annot, genome = "hg38")

DefaultAssay(obj) <- "common"
obj <- FindTopFeatures(obj, min.cutoff = "q5")
obj <- RunTFIDF(obj)
obj <- RunSVD(obj)
obj <- RunUMAP(obj, reduction = "lsi", dims = 2:30, reduction.name = "umap.atac", store.model = TRUE)

# save object
saveRDS(object = obj, file = paste0("objects/", sample_name, ".rds"))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
library(Seurat)
library(Signac)
library(GenomicRanges)

args = commandArgs(trailingOnly = TRUE)
ncore <- as.numeric(args[1])
annot.path <- args[2]
peak.path <- args[3]
sample_name <- args[4]
ref <- args[5]

annot <- readRDS(annot.path)
peaks <- read.table(file = peak.path, col.names = c("chrom", "start", "end"))
peaks <- makeGRangesFromDataFrame(df = peaks)
peaks <- keepStandardChromosomes(peaks, pruning.mode = "coarse")

message("Using ", ncore, " cores")
if (ncore > 1) {
  library(future)
  plan("multicore", workers = ncore)
  options(future.globals.maxSize = 100 * 1024^3)
}

# create object
countfile <- list.files(paste0("data/pbmc_multiome/", sample_name), pattern = "*.h5$", full.names = TRUE)
frags <- list.files(paste0("data/pbmc_multiome/", sample_name), pattern = "*.tsv.gz$", full.names = TRUE)
md_file <- list.files(paste0("data/pbmc_multiome/", sample_name), patter = "*.csv$", full.names = TRUE)

counts <- Read10X_h5(filename = countfile)
metadata <- read.table(file = md_file, sep = ",", header = TRUE, row.names = 1)

obj <- CreateSeuratObject(counts = counts$`Gene Expression`, assay = "RNA", meta.data = metadata)
obj[["ATAC"]] <- CreateChromatinAssay(counts = counts$Peaks, fragments = frags, genome = "hg38", annotation = annot, sep = c(":", "-"))

# filter low/high count cells
obj <- subset(obj, subset = nCount_ATAC > 2000 &
                nCount_ATAC < 70000 &
                nCount_RNA > 500 & 
                nCount_RNA < 15000)

# process ATAC
DefaultAssay(obj) <- "ATAC"
obj <- NucleosomeSignal(obj)
obj <- TSSEnrichment(obj)

# create assay containing shared peaks
common_counts <- FeatureMatrix(
  fragments = Fragments(obj)[[1]],
  features = peaks,
  cells = colnames(obj)
)
obj[["common"]] <- CreateChromatinAssay(counts = common_counts, fragments = frags, annotation = annot, genome = "hg38")

DefaultAssay(obj) <- "common"
obj <- FindTopFeatures(obj, min.cutoff = "q5")
obj <- RunTFIDF(obj)
obj <- RunSVD(obj)
obj <- RunUMAP(obj, reduction = "lsi", dims = 2:30, reduction.name = "umap.atac", store.model = TRUE)

# process RNA
DefaultAssay(obj) <- "RNA"
obj <- FindVariableFeatures(obj)
obj <- NormalizeData(obj)
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- RunUMAP(obj, reduction = "pca", dims = 1:30, reduction.name = "umap.rna", store.model = TRUE)

# map to reference
reference <- readRDS(ref)

anchors <- FindTransferAnchors(
  reference = reference,
  query = obj,
  reference.assay = "SCT",
  query.assay = "RNA",
  dims = 1:30,
  normalization.method = "SCT",
  recompute.residuals = TRUE,
  reduction = 'cca'
)

pred <- TransferData(
    anchorset = anchors,
    refdata = reference$celltype.l2,
    weight.reduction = obj[['pca']],
    dims = 1:30
)
obj <- AddMetaData(object = obj, metadata = pred)

# save object
saveRDS(object = obj, file = paste0("objects/", sample_name, ".rds"))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
library(Seurat)
library(SeuratDisk)


if (!requireNamespace("SeuratDisk", quietly = TRUE)) {
  if (!requireNamespace("remotes", quietly = TRUE)) {
    options(repos=list('CRAN'="https://cloud.r-project.org"))
    install.packages("remotes")
  }
  remotes::install_github("mojaveazure/seurat-disk")
}

# load preprocessed reference
obj <- LoadH5Seurat(file = "data/pbmc_reference/pbmc_multimodal.h5seurat")

# add raw RNA counts
counts <- Matrix::readMM("data/pbmc_reference/GSM5008737_RNA_3P-matrix.mtx.gz")
features <- read.table("data/pbmc_reference/GSM5008737_RNA_3P-features.tsv.gz", sep = "\t")
cells <- read.table("data/pbmc_reference/GSM5008737_RNA_3P-barcodes.tsv.gz", sep = "\t")

rownames(counts) <- features$V1
colnames(counts) <- cells$V1
counts <- as(counts, "dgCMatrix")

obj[["RNA"]] <- CreateAssayObject(counts = counts)
DefaultAssay(obj) <- "RNA"
obj <- FindVariableFeatures(obj)
obj <- NormalizeData(obj)

# compute SCT residuals for reference
obj <- SCTransform(
  object = obj,
  reference.SCT.model = slot(object = obj[['SCT']], name = "SCTModel.list")[[1]],
  residual.features = VariableFeatures(obj[["SCT"]]),
  assay = "RNA",
  new.assay.name = "SCT",
  verbose = FALSE
)
obj$dataset <- "reference"

saveRDS(obj, "objects/pbmc_reference.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
library(Seurat)
library(ggplot2)

args <- commandArgs(trailingOnly = TRUE)
samplename <- as.character(args[[1]])

obj <- readRDS(paste0("objects/", samplename, ".rds"))
if (grepl("reference", samplename)) {
  gb <- 'celltype.l2'
} else {
  annot <- read.table(paste0("annotations/", samplename, ".tsv.gz"), sep = "\t")
  obj <- AddMetaData(obj, annot)
  gb <- 'predicted.id'
}

p <- DimPlot(
  object = obj,
  label = TRUE,
  repel = TRUE,
  group.by = gb,
  pt.size = 0.1
) + NoLegend()

ggsave(
  filename = paste0("plots/", samplename, ".png"),
  plot = p,
  height = 6,
  width = 7
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
library(Signac)
library(Seurat)


args <- commandArgs(trailingOnly = TRUE)
samplename <- as.character(args[[1]])
query <- readRDS(paste0("objects/", samplename, ".rds"))

if (grepl('reference', samplename)) {
  # reference object given as input
  pred <- query[[]]
  write.table(
    x = pred,
    file = paste0("annotations/", samplename, ".tsv"),
    sep = "\t",
    quote = FALSE
  )
} else {
  # map query to reference

  is_multiome <- grepl("multiome", samplename)
  is_pbmc <- grepl("pbmc", samplename)

  if (is_pbmc) {
    reference <- readRDS("objects/pbmc_reference.rds")
  } else {
    reference <- readRDS("objects/bmmc_reference.rds")
  }

  if (is_multiome & is_pbmc) {
    # pbmc multiome
    normalization.method <- "SCT"
    recompute.residuals <- TRUE
    query.assay <- 'RNA'
    reference.assay <- 'SCT'
    reduction <- 'pcaproject'
    weight.reduction <- 'pcaproject'
    wt.dims <- 1:40
  } else if (is_multiome) {
    # bmmc multiome
    normalization.method <- "LogNormalize"
    recompute.residuals <- FALSE
    query.assay <- 'RNA'
    reference.assay <- 'RNA'
    reduction <- 'pcaproject'
    weight.reduction <- 'pcaproject'
    wt.dims <- 1:40
  } else {
    # atac
    normalization.method <- 'LogNormalize'
    recompute.residuals <- FALSE
    query.assay <- 'GA'
    reference.assay <- 'RNA'
    reduction <- 'cca'
    weight.reduction <- query[['lsi']]
    wt.dims <- 2:30
  }

  anchors <- FindTransferAnchors(
    reference = reference,
    query = query,
    normalization.method = normalization.method,
    recompute.residuals = recompute.residuals,
    reference.assay = reference.assay,
    query.assay = query.assay,
    reduction = reduction,
    dims = 1:40
  )

  pred <- TransferData(
    anchorset = anchors,
    refdata = reference$celltype.l2,
    weight.reduction = weight.reduction,
    dims = wt.dims
  )

  write.table(
    x = pred,
    file = paste0("annotations/", samplename, ".tsv"),
    sep = "\t",
    quote = FALSE
  )
}
50
51
52
53
shell:
    """
    wget -i {input} -P data/pbmc_multiome/{wildcards.dset}
    """
60
61
62
63
shell:
    """
    wget -i {input} -P data/pbmc_atac/{wildcards.dset}
    """
70
71
72
73
74
shell:
    """
    wget -i {input} -P data/bmmc_atac
    aws s3 sync s3://mpal-hg38/public/ ./data/bmmc_atac/ --request-payer
    """
81
82
83
84
85
86
shell:
    """
    wget -i {input} -P data/bmmc_multiome
    cd data/bmmc_multiome
    gzip -d GSE194122_openproblems_neurips2021_multiome_BMMC_processed.h5ad.gz
    """
92
93
94
95
shell:
  """
  aws s3 sync s3://openproblems-bio/public/post_competition/multiome ./data/bmmc_multiome
  """
101
102
103
104
shell:
    """
    aws s3 cp s3://bmmc-reference/public/fullref.Rds {output} --request-payer
    """
SnakeMake From line 101 of master/Snakefile
111
112
113
114
shell:
    """
    wget -i {input} -P data/{wildcards.dset}
    """
SnakeMake From line 111 of master/Snakefile
120
shell: "wget https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_10k/pbmc_granulocyte_sorted_10k_atac_peaks.bed -O {output}"
SnakeMake From line 120 of master/Snakefile
136
137
138
139
shell:
  """
  python processing_code/convert_h5ad.py data/bmmc_multiome
  """
SnakeMake From line 136 of master/Snakefile
153
154
155
156
shell:
    """
    Rscript processing_code/bmmc_multiome.R
    """
SnakeMake From line 153 of master/Snakefile
166
167
168
169
shell:
    """
    Rscript processing_code/bmmc_atac.R
    """
SnakeMake From line 166 of master/Snakefile
176
177
178
179
shell:
    """
    Rscript processing_code/bmmc_reference.R
    """
SnakeMake From line 176 of master/Snakefile
186
shell: "Rscript processing_code/get_annotations.R"
SnakeMake From line 186 of master/Snakefile
193
shell: "Rscript processing_code/pbmc_reference.R"
SnakeMake From line 193 of master/Snakefile
204
205
206
207
208
209
210
211
212
shell:
    """
    Rscript processing_code/pbmc_multiome.R \
            {threads} \
            {input.annot} \
            {input.peaks} \
            {wildcards.sample} \
            {input.ref}
    """
SnakeMake From line 204 of master/Snakefile
222
223
224
225
226
227
228
229
shell:
    """
    Rscript processing_code/pbmc_atac.R \
            {threads} \
            {input.annot} \
            {input.peaks} \
            {wildcards.sample} 
    """
SnakeMake From line 222 of master/Snakefile
241
242
243
244
245
shell:
    """
    Rscript processing_code/refmap.R {wildcards.sample}
    gzip annotations/{wildcards.sample}.tsv
    """
SnakeMake From line 241 of master/Snakefile
254
255
256
257
shell:
    """
    Rscript processing_code/plot.R {wildcards.sample}
    """
SnakeMake From line 254 of master/Snakefile
ShowHide 22 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/timoast/epi-immune
Name: epi-immune
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...