BioWorkflows: Code to reproduce analyses shown in the Signac paper

library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(dplyr)
library(tidyr)
library(BSgenome.Hsapiens.UCSC.hg38)

# Load the saved PBMC object and the saved links object, make "ATAC" the
# default assay, and attach the links to the object.
pbmc <- readRDS(file = "objects/pbmc.rds")
lnk <- readRDS(file = "objects/pbmc_links.rds")

DefaultAssay(pbmc) <- "ATAC"
Links(pbmc) <- lnk

# ----- QC plots -----

# Fragment length histogram, grouped by orig.ident
nucleosome_plot <- FragmentHistogram(object = pbmc, group.by = "orig.ident")
nucleosome_plot <- nucleosome_plot + ggtitle("Nucleosome signal")
saveRDS(nucleosome_plot, file = "figures/nucleosome_signal.rds")

# TSS plot computed from the "cellranger" assay, grouped by orig.ident
tss_plot <- TSSPlot(object = pbmc, assay = "cellranger", group.by = "orig.ident")
saveRDS(tss_plot, file = "figures/tss_enrichment.rds")

# ----- Dim plots -----

# Labelled DimPlots on the RNA and ATAC UMAP reductions, each saved as RDS
rna_dimplot <- DimPlot(
  object = pbmc,
  reduction = "umap.rna",
  label = TRUE,
  repel = TRUE
)
saveRDS(rna_dimplot, file = "figures/pbmc_rna_dimplot.rds")

atac_dimplot <- DimPlot(
  object = pbmc,
  reduction = "umap.atac",
  label = TRUE,
  repel = TRUE
)
saveRDS(atac_dimplot, file = "figures/pbmc_atac_dimplot.rds")

# ----- Markers -----

# Positive markers for CD8 TEM versus CD8 Naive, tested with "LR" and
# nCount_ATAC as a latent variable
markers <- FindMarkers(
  pbmc,
  ident.1 = "CD8 TEM",
  ident.2 = "CD8 Naive",
  only.pos = TRUE,
  test.use = "LR",
  latent.vars = "nCount_ATAC"
)

# keep markers passing both the adjusted p-value and log2 fold-change cutoffs
is_top_marker <- markers$p_val_adj < 0.01 & markers$avg_log2FC > 0.4
top.markers <- markers[is_top_marker, ]

motifs <- FindMotifs(
  pbmc,
  features = rownames(top.markers),
  features.match = c("GC.percent", "count", "sequence.length")
)

# EOMES, TBX21, TBX2 equally enriched in effector T cell peaks
# look at RNA data to see which is expressed
# compare with chromvar deviations

tf_use <- c("EOMES", "TBX21", "TBX2")
cd8_idents <- c("CD8 TEM", "CD8 Naive")

DefaultAssay(pbmc) <- "RNA"

tf_expression <- VlnPlot(pbmc, features = tf_use, idents = cd8_idents, pt.size = 0)
tf_expression <- tf_expression &
  ylim(c(0, 2.5)) &
  ylab("RNA expression") &
  ggtitle("") &
  xlab("")

DefaultAssay(pbmc) <- "chromvar"

# Violin plot of the idx-th motif in `motifs`, titled with the idx-th TF name.
# NOTE(review): assumes the first rows of `motifs` correspond to `tf_use` in
# the same order -- confirm against the FindMotifs output.
plot_motif_deviation <- function(idx) {
  vln <- VlnPlot(
    pbmc,
    features = motifs$motif[idx],
    idents = cd8_idents,
    pt.size = 0
  )
  vln +
    ggtitle(tf_use[idx]) +
    NoLegend() +
    ylab("chromVAR deviation") +
    xlab("") +
    theme(axis.text.x = element_blank())
}

tf_chromvar <- wrap_plots(
  lapply(X = seq_along(tf_use), FUN = plot_motif_deviation),
  ncol = 3
)

DefaultAssay(pbmc) <- "ATAC"

footprint_tfs <- c("EOMES", "TBX21")

pbmc <- Footprint(
  pbmc,
  motif.name = footprint_tfs,
  genome = BSgenome.Hsapiens.UCSC.hg38
)

fp <- PlotFootprint(pbmc, features = footprint_tfs, idents = cd8_idents)
fp <- fp & NoLegend() & plot_layout(ncol = 1)

# motif plots for the first three motifs in the enrichment table
mp <- MotifPlot(pbmc, head(motifs$motif, 3))

saveRDS(mp, file = "figures/motifplot.rds")
saveRDS(tf_chromvar, file = "figures/chromvar_vln.rds")
saveRDS(tf_expression, file = "figures/tf_rna_vln.rds")
saveRDS(fp, file = "figures/footprint.rds")

# ----- Coverage plot -----

# Coverage around CD8A (extended 2 kb on each side) for selected identities,
# with CD8A expression taken from the RNA assay and links not drawn
covplot_idents <- c(
  "CD4 Naive", "CD4 TCM", "CD8 Naive",
  "CD8 TEM", "MAIT", "NK", "Treg"
)

covplot <- CoveragePlot(
  pbmc,
  region = "CD8A",
  features = "CD8A",
  expression.assay = "RNA",
  idents = covplot_idents,
  extend.upstream = 2000,
  extend.downstream = 2000,
  links = FALSE
)

saveRDS(covplot, file = "figures/coverage_plot.rds")

# ----- Link analysis -----

# percentage of links with a negative score
sum(lnk$score < 0) / length(lnk) * 100

# fraction of links wider than 100 kb
sum(width(lnk) > 100000) / length(lnk)

# number of links per gene (regulatory complexity)
# compare cell-type-specific vs housekeeping genes

link.df <- as.data.frame(lnk)

# for each gene, count the linked peaks with positive and non-positive scores
links_per_gene <- link.df %>%
  mutate(pos_link = score > 0) %>%
  group_by(gene) %>%
  summarise(positive_links = sum(pos_link), negative_links = sum(!pos_link))

total_links_per_gene <- links_per_gene$positive_links + links_per_gene$negative_links

mean(total_links_per_gene)
# 6.373724

sd(total_links_per_gene)
# 7.110643

# number of genes for each (positive, negative) link-count combination
gene_tile_counts <- links_per_gene %>%
  group_by(positive_links, negative_links) %>%
  summarise(count = n())

link_per_gene_plot <- ggplot(
  data = gene_tile_counts,
  mapping = aes(x = positive_links, y = negative_links, fill = log10(count+1))
) +
  geom_tile() +
  theme_bw() +
  scale_fill_viridis_c() +
  labs(
    y = "Total negative links",
    x = "Total positive links",
    title = "Number of linked peaks per gene"
  )

# for each peak, count the linked genes with positive and non-positive scores
genes_per_link <- link.df %>%
  mutate(pos_link = score > 0) %>%
  group_by(peak) %>%
  summarise(positive_links = sum(pos_link), negative_links = sum(!pos_link))

total_genes_per_peak <- genes_per_link$positive_links + genes_per_link$negative_links

mean(total_genes_per_peak)
# 1.578854

sd(total_genes_per_peak)
# 1.259847

# number of peaks for each (positive, negative) link-count combination
peak_tile_counts <- genes_per_link %>%
  group_by(positive_links, negative_links) %>%
  summarise(count = n())

gene_per_link_plot <- ggplot(
  data = peak_tile_counts,
  mapping = aes(x = positive_links, y = negative_links, fill = log10(count+1))
) +
  geom_tile() +
  theme_bw() +
  scale_x_continuous(breaks = 0:10) +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c() +
  labs(
    y = "Total negative links",
    x = "Total positive links",
    title = "Number of linked genes per peak"
  )

# link width distributions, split by the sign of the link score
positive.links <- link.df[link.df$score > 0, ]
negative.links <- link.df[link.df$score < 0, ]

p1 <- ggplot(data = positive.links, mapping = aes(x = width)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  labs(x = "", y = "Count", title = "Positive gene associations")

p2 <- ggplot(data = negative.links, mapping = aes(x = width)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  labs(
    x = "Distance to gene TSS (bp)",
    y = "Count",
    title = "Negative gene associations"
  )

# distribution of link p-values
p3 <- ggplot(data = link.df, mapping = aes(x = pvalue)) +
  geom_histogram(bins = 100) +
  theme_classic() +
  labs(x = "p-value", y = "Count", title = "p-value distribution")

saveRDS(p1, file = "figures/distance_positive.rds")
saveRDS(p2, file = "figures/distance_negative.rds")
saveRDS(p3, file = "figures/link_pvals.rds")
saveRDS(gene_per_link_plot, file = "figures/genes_per_link_plot.rds")
saveRDS(link_per_gene_plot, file = "figures/link_per_gene_plot.rds")

# ----- Link plots -----

# identities shown in both linked coverage plots
link_idents <- c(
  "B naive", "B intermediate", "B memory",
  "CD14 Mono", "CD16 Mono", "CD8 TEM", "CD8 Naive"
)

# MS4A1: 500 bp upstream, 10 kb downstream
linked_1 <- CoveragePlot(
  pbmc,
  region = "MS4A1",
  features = "MS4A1",
  idents = link_idents,
  extend.upstream = 500,
  extend.downstream = 10000
)
saveRDS(linked_1, file = "figures/linked_covplot1.rds")

# LYZ: 5 kb on each side
linked_2 <- CoveragePlot(
  pbmc,
  region = "LYZ",
  features = "LYZ",
  idents = link_idents,
  extend.upstream = 5000,
  extend.downstream = 5000
)
saveRDS(linked_2, file = "figures/linked_covplot2.rds")

# ----- Peak calling comparison -----

DefaultAssay(pbmc) <- "cellranger"

# call MACS2 peaks on pseudobulk
macs2_bin <- "/home/stuartt/miniconda3/envs/signac/bin/macs2"
pks <- CallPeaks(
  pbmc,
  macs2.path = macs2_bin,
  additional.args = "--max-gap 50"
)

# example where cellranger incorrectly merges peaks:
# CD8A coverage (2 kb either side) with the MACS2 peaks drawn as a ranges track
cp <- CoveragePlot(
  pbmc,
  region = "CD8A",
  extend.upstream = 2000,
  extend.downstream = 2000,
  ranges = pks,
  ranges.title = "MACS2",
  links = FALSE
)
saveRDS(cp, file = "figures/cellranger_peakcalling.rds")

DefaultAssay(pbmc) <- "ATAC"

# example where celltype specific peaks missed (need to also run MACS2 on bulk and compare)
# find markers for a rare population

mrk_cd56 <- FindMarkers(
  object = pbmc,
  ident.1 = "NK_CD56bright",
  ident.2 = "NK",
  latent.vars = "nCount_ATAC",
  test.use = "LR",
  only.pos = TRUE
)

all.markers <- FindAllMarkers(
  object = pbmc,
  test.use = "LR",
  latent.vars = "nCount_ATAC",
  only.pos = TRUE
)

# keep only peaks that are reported as a marker exactly once
all.markers$isunique <- Biobase::isUnique(all.markers$gene)
all.unique.markers <- all.markers[all.markers$isunique, ]

# number of cells per identity class
n_celltype <- table(Idents(pbmc))

# For each cell type, the fraction of its unique marker peaks that overlap at
# least one pseudobulk MACS2 peak in `pks`. Cell types without any unique
# marker get NA instead of a 0/0 division.
fraction_recovered <- vapply(
  X = names(n_celltype),
  FUN = function(celltype) {
    markers.use <- all.unique.markers[all.unique.markers$cluster == celltype, ]
    if (nrow(markers.use) == 0) {
      return(NA_real_)
    }
    markers.ranges <- StringToGRanges(markers.use$gene)
    n_hit <- sum(countOverlaps(query = markers.ranges, subject = pks) > 0)
    n_hit / length(markers.ranges)
  },
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)

# One row per cell type. The table is unpacked into plain columns so the plot
# can map columns of `df` instead of picking up the global `n_celltype` table.
df <- data.frame(
  celltype = names(n_celltype),
  n_cells = as.vector(n_celltype),
  fraction_recovered = fraction_recovered
)

missed_peak_count <- ggplot(df, aes(n_cells, fraction_recovered)) +
  geom_point() +
  theme_classic() +
  ylab("Fraction of cell-type-specific peaks identified") +
  xlab("Number of cells")

# coverage around chr19:3805000-3806000 with the pseudobulk peaks as a track
missed <- CoveragePlot(
  object = pbmc,
  region = "chr19-3805000-3806000",
  ranges.title = "Bulk",
  ranges = pks,
  extend.upstream = 5000,
  extend.downstream = 5000,
  links = FALSE
)

saveRDS(object = missed_peak_count, file = "figures/missed_peak_count.rds")
saveRDS(object = missed, file = "figures/macs2_pseudobulk.rds")

# get average number of macs2 peaks overlapped by cellranger peak
# (query = pseudobulk MACS2 peaks, subject = ranges of the "cellranger" assay)
olap <- findOverlaps(query = pks, subject = pbmc[["cellranger"]])

# number of subject (cellranger) ranges hit by more than one query range,
# then number of query (MACS2) ranges hitting more than one subject range
sum(table(subjectHits(olap)) > 1) # 13751
sum(table(queryHits(olap)) > 1)   # 2

# call peaks per DNA accessibility cluster
cluster_peaks <- CallPeaks(
  object = pbmc,
  group.by = "cellranger_snn_res.0.8",
  additional.args = "--max-gap 50",
  macs2.path = "/home/stuartt/miniconda3/envs/signac/bin/macs2"
)

# check overlap with cell-type specific peaks
# (query = ranges of the "ATAC" assay, subject = per-cluster MACS2 peaks)
olap <- findOverlaps(query = pbmc[["ATAC"]], subject = cluster_peaks)
# NOTE(review): the numerator counts distinct *subject* (cluster_peaks) hits
# while the denominator is the number of ATAC ranges (the query) -- confirm
# whether queryHits was intended here.
sum(table(subjectHits(olap)) >= 1)  / length(granges(pbmc[["ATAC"]])) # 0.9270874

sum(table(subjectHits(olap)) > 1) # 1059
sum(table(queryHits(olap)) > 1)   # 2497
length(cluster_peaks) # 152473
nrow(pbmc[["ATAC"]])  # 155611

# same comparison against the pseudobulk MACS2 peaks
olap <- findOverlaps(query = pbmc[["ATAC"]], subject = pks)
# NOTE(review): same subjectHits / ATAC-length pairing as above -- confirm.
sum(table(subjectHits(olap)) >= 1)  / length(granges(pbmc[["ATAC"]])) # 0.7859085

sum(table(subjectHits(olap)) > 1) # 3529
sum(table(queryHits(olap)) > 1)   # 1310
length(pks) # 134195
nrow(pbmc[["ATAC"]])  # 155611

# DimPlot showing ATAC clusters
cluster_dimplot <- DimPlot(pbmc, group.by = "cellranger_snn_res.0.8", label = TRUE, repel = TRUE, reduction = "umap.atac") +
  ggtitle("scATAC-seq cell clusters") + theme(legend.position = "none") +
  xlab("UMAP 1") + ylab("UMAP 2")

# save both a PNG rendering and the plot object itself
ggsave(filename = "figures/atac_cluster_dimplot.png", plot = cluster_dimplot, height = 8, width = 8, dpi = 400)
saveRDS(cluster_dimplot, "figures/atac_cluster_dimplot.rds")

## ------- Multimodal label transfer ----------

pbmc.atac <- readRDS("objects/multimodal_label_transfer.rds")

# high-res prediction
# put predictions on the same factor levels as the ground truth so the two
# can be compared element-wise and tabulated on a shared set of labels
pbmc.atac$predicted.id <- factor(pbmc.atac$predicted.id, levels = levels(pbmc.atac$gt))
pbmc.atac$annotation_correct <- pbmc.atac$predicted.id == pbmc.atac$gt
p1 <- DimPlot(pbmc.atac, group.by = "gt", label = TRUE, repel = TRUE, reduction = "umap.atac") +
  NoLegend() +
  ggtitle("Ground-truth annotation")
p2 <- DimPlot(pbmc.atac, group.by = "predicted.id", label = TRUE, repel = TRUE, reduction = "umap.atac") +
  NoLegend() +
  ggtitle("Predicted annotation")

# confusion matrix: rows = ground truth, columns = prediction
predictions <- table(pbmc.atac$gt, pbmc.atac$predicted.id)
predictions <- predictions/rowSums(predictions)  # normalize for number of cells in each cell type
predictions <- as.data.frame(predictions)
p3 <- ggplot(predictions, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_viridis_c() +
  xlab("Annotated cell type (RNA)") +
  ylab("Predicted cell type (ATAC)") +
  labs(fill = "Fraction of cells") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# overall accuracy of the high-resolution prediction
incorrect <- length(which(pbmc.atac$gt != pbmc.atac$predicted.id))
1 - (incorrect / ncol(pbmc.atac)) # 0.870151

# prediction score distribution, split by whether the annotation was correct
data <- FetchData(pbmc.atac, vars = c("prediction.score.max", "annotation_correct"))
p4 <- ggplot(data, aes(prediction.score.max, fill = annotation_correct, colour = annotation_correct)) +
  geom_histogram() +
  facet_wrap(~annotation_correct) +
  xlab("Prediction Score") +
  theme_bw()

ggsave(filename = "figures/label_transfer_accuracy.pdf", plot = p3, height = 4.5, width = 6)
saveRDS(object = p3, file = "figures/label_transfer_accuracy.rds")

# Pass the combined plot to ggsave() explicitly. Adding ggsave() to a plot
# with `+` relies on last_plot() and breaks when ggsave() returns a value.
label_transfer_panels <- p1 | p2 | p4
ggsave(
  filename = "figures/multimodal_label_transfer.png",
  plot = label_transfer_panels,
  height = 8,
  width = 18
)

# coarse prediction
incorrect <- length(which(pbmc.atac$coarse_celltype != pbmc.atac$coarse_predicted))
1 - (incorrect / ncol(pbmc.atac)) # 0.925473

R ggplot2 dplyr tidyr macs2 Seurat BSgenome.Hsapiens.UCSC.hg38 patchwork Signac From line 1 of code/analyze_pbmc.R

# cell counts of the downsampled datasets and the core counts to benchmark
declare -a ncell=("50000" "100000" "200000" "300000" "400000" "500000" "600000" "700000")
declare -a ncore=("1" "2" "4" "8")

# -p creates missing parent directories and succeeds if the directory exists
mkdir -p data/biccn/benchmarks

# feature matrix
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    # need to pin the CPUs for each core count separately since it doesn't
    # seem to obey the set number of threads; the list is 1, 1,2, 1,2,3,4, ...
    cpus=$(seq -s, 1 "$j")
    taskset --cpu-list "$cpus" Rscript --vanilla code/downsampling_code/run_archr_peakmatrix.R \
        "$j" \
        "archr_biccn/${i}" \
        data/biccn/unified_peaks.bed \
        3 \
        "data/biccn/benchmarks/archr_featmat_runtime_${i}_${j}.txt" \
        "mm10"
  done
done

# gene activity
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    # need to pin the CPUs for each core count separately since it doesn't
    # seem to obey the set number of threads; the list is 1, 1,2, 1,2,3,4, ...
    cpus=$(seq -s, 1 "$j")
    taskset --cpu-list "$cpus" Rscript --vanilla code/downsampling_code/run_archr_geneactivity.R \
        "$j" \
        "archr_biccn/${i}" \
        3 \
        "data/biccn/benchmarks/archr_geneactivity_runtime_${i}_${j}.txt" \
        "mm10"
  done
done

# lsi (always pinned to CPU 1)
for i in "${ncell[@]}"; do
  project_dir="archr_biccn/${i}"
  runtime_file="data/biccn/benchmarks/archr_lsi_runtime_${i}.txt"
  taskset --cpu-list 1 Rscript --vanilla code/downsampling_code/run_archr_lsi.R \
    "$project_dir" \
    data/biccn/unified_peaks.bed \
    1 \
    "$runtime_file" \
    "mm10"
done

Shell From line 3 of biccn_downsampling/benchmark_archr.sh

# cell counts of the downsampled datasets and the core counts to benchmark
declare -a ncell=("50000" "100000" "200000" "300000" "400000" "500000" "600000" "700000")
declare -a ncore=("1" "2" "4" "8")

# -p creates missing parent directories and succeeds if the directory exists
mkdir -p data/biccn/benchmarks
mkdir -p data/biccn/downsampling

# run each step with different numbers of cores, profile max memory usage

bench_dir=data/biccn/benchmarks
ds_dir=data/biccn/downsampling
frag_dir=/scratch/tim/biccn/downsampling

# run_profiled <time-report-file> <R script name> [script args...]
# Runs code/downsampling_code/<R script name> under `/usr/bin/time -v`,
# writing the time report to <time-report-file>.
run_profiled() {
  local time_report=$1
  local script=$2
  shift 2
  /usr/bin/time -o "$time_report" \
    -v Rscript --vanilla "code/downsampling_code/${script}" "$@"
}

# feature matrix
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    run_profiled "${bench_dir}/featmat_mem_${i}_${j}.txt" run_featurematrix.R \
      "$j" \
      "${frag_dir}/${i}.rds" \
      data/biccn/unified_peaks.bed \
      3 \
      "${bench_dir}/featmat_runtime_${i}_${j}.txt" \
      "${ds_dir}/counts_${i}.rds"
  done
done

# nucleosome signal
for i in "${ncell[@]}"; do
  run_profiled "${bench_dir}/nucleosome_mem_${i}.txt" run_nucleosome.R \
    "${ds_dir}/counts_${i}.rds" \
    "${frag_dir}/${i}.rds" \
    data/biccn/annotations.rds \
    3 \
    "${bench_dir}/nucleosome_runtime_${i}.txt" \
    "${ds_dir}/nucleosome_${i}.rds"
done


# tss enrichment
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    run_profiled "${bench_dir}/tss_mem_${i}_${j}.txt" run_tss.R \
      "${ds_dir}/nucleosome_${i}.rds" \
      3 \
      "${bench_dir}/tss_runtime_${i}_${j}.txt" \
      "${ds_dir}/tss_${i}.rds" \
      "$j"
  done
done

# gene activity matrix
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    run_profiled "${bench_dir}/ga_mem_${i}_${j}.txt" run_gene_activity.R \
      "${ds_dir}/tss_${i}.rds" \
      "$j" \
      3 \
      "${bench_dir}/ga_runtime_${i}_${j}.txt"
  done
done

# tf-idf
for i in "${ncell[@]}"; do
  run_profiled "${bench_dir}/tfidf_mem_${i}.txt" run_tfidf.R \
    "${ds_dir}/tss_${i}.rds" \
    3 \
    "${bench_dir}/tfidf_runtime_${i}.txt" \
    "${ds_dir}/tfidf_${i}.rds"
done

# svd
for i in "${ncell[@]}"; do
  run_profiled "${bench_dir}/svd_mem_${i}.txt" run_svd.R \
    "${ds_dir}/tfidf_${i}.rds" \
    3 \
    "${bench_dir}/svd_runtime_${i}.txt" \
    "${ds_dir}/svd_${i}.rds"
done

Shell From line 3 of biccn_downsampling/benchmark.sh

downsamples <- c('50000', '100000', '200000', '300000', '400000', '500000', '600000', '700000')
cores <- c(1, 2, 4, 8)
benchmark_dir <- "data/biccn/benchmarks"

# Read one runtime file (one numeric value per line).
# File name pattern: <prefix>_runtime_<cells>[_<cores>].txt
read_runtimes <- function(prefix, cells, n_cores = NULL) {
  suffix <- if (is.null(n_cores)) cells else paste0(cells, "_", n_cores)
  path <- file.path(benchmark_dir, paste0(prefix, "_runtime_", suffix, ".txt"))
  as.numeric(readLines(con = path))
}

# Build the timing rows for one (step, method, cells, cores) combination.
# Every column is repeated to length(runtime), so an empty runtime file
# yields zero rows instead of an error.
timing_rows <- function(step, method, runtime, cells, n_cores = 1) {
  n <- length(runtime)
  data.frame(
    "Cells" = rep(as.numeric(cells), n),
    "Cores" = rep(n_cores, n),
    "Step" = rep(step, n),
    "Runtime" = runtime,
    "Method" = rep(method, n)
  )
}

# Collect all timings for one step across the downsampled datasets.
#   signac_prefix / archr_prefix: file name prefix for each method, or NULL
#     when that method has no file for this step
#   per_core: TRUE when there is one file per core count; otherwise there is
#     a single file per dataset and Cores is recorded as 1
# Row order is dataset, then core count, then Signac before ArchR.
collect_step <- function(step, signac_prefix = NULL, archr_prefix = NULL,
                         per_core = FALSE) {
  core_values <- if (per_core) as.list(cores) else list(NULL)
  rows <- list()
  for (i in downsamples) {
    for (j in core_values) {
      n_cores <- if (is.null(j)) 1 else j
      if (!is.null(signac_prefix)) {
        rows[[length(rows) + 1]] <- timing_rows(
          step, "Signac", read_runtimes(signac_prefix, i, j), i, n_cores
        )
      }
      if (!is.null(archr_prefix)) {
        rows[[length(rows) + 1]] <- timing_rows(
          step, "ArchR", read_runtimes(archr_prefix, i, j), i, n_cores
        )
      }
    }
  }
  do.call(rbind, rows)
}

# object creation timings: column 1 is the runtime and column 4 the cell count;
# for ArchR only the rows whose second column is "Arrow" are used
runtime <- read.table(file = file.path(benchmark_dir, "signac_object_creation.tsv"), sep = "\t")
runtime_archr <- read.table(file = file.path(benchmark_dir, "archr_object_creation.tsv"), sep = "\t")
runtime_archr <- runtime_archr[runtime_archr$V2 == "Arrow", ]
create_rows <- data.frame(
  "Cells" = c(runtime$V4, runtime_archr$V4),
  "Cores" = 1,
  "Step" = "Create",
  "Runtime" = c(runtime$V1, runtime_archr$V1),
  "Method" = c(rep("Signac", nrow(runtime)), rep("ArchR", nrow(runtime_archr)))
)

# bind everything once instead of growing the data frame inside loops
results_df <- rbind(
  create_rows,
  collect_step("FeatureMatrix", "featmat", "archr_featmat", per_core = TRUE),
  collect_step("NucleosomeSignal", "nucleosome"),
  collect_step("GeneActivity", "ga", "archr_geneactivity", per_core = TRUE),
  collect_step("TSSEnrichment", "tss", per_core = TRUE),
  collect_step("RunTFIDF", "tfidf"),
  collect_step("RunSVD", "svd"),
  collect_step("LSI", archr_prefix = "archr_lsi"),
  collect_step("estLSI", archr_prefix = "archr_est_lsi")
)

# add LSI for Signac as the sum of the TF-IDF and SVD runtimes
tfidf <- results_df[results_df$Step == "RunTFIDF", ]
runsvd <- results_df[results_df$Step == "RunSVD", ]
# the row-wise sum is only meaningful if both steps line up by dataset
stopifnot(
  nrow(tfidf) == nrow(runsvd),
  all(tfidf$Cells == runsvd$Cells)
)
lsi <- runsvd
lsi$Runtime <- runsvd$Runtime + tfidf$Runtime
lsi$Step <- "LSI"
results_df <- rbind(results_df, lsi)

results_df$Cells <- as.numeric(results_df$Cells)
results_df$Cores <- as.factor(results_df$Cores)

write.table(x = results_df, file = "data/biccn/timings.tsv")

R From line 1 of biccn_downsampling/collate_timings.R

library(Seurat)
library(Signac)
library(ggplot2)
library(mclust)


pbmc <- readRDS(file = "objects/pbmc.rds")
celltypes <- pbmc$celltype

# sweep clustering parameters
k.param <- seq(5, 50, 5)
dims.param <- seq(10, 50, 5)

# every (dims, k) combination; dims varies fastest, so for each k all dims
# values are visited in order
param.grid <- expand.grid(dims = dims.param, k = k.param)
ari.values <- numeric(nrow(param.grid))

for (row in seq_len(nrow(param.grid))) {
  n.dims <- param.grid$dims[[row]]
  n.neighbors <- param.grid$k[[row]]
  pbmc <- FindNeighbors(pbmc, reduction = "lsi", dims = 2:n.dims, k.param = n.neighbors)
  pbmc <- FindClusters(pbmc, algorithm = 3, graph.name = "ATAC_snn")
  # agreement between the resulting clusters and the stored cell type labels
  ari.values[[row]] <- adjustedRandIndex(x = celltypes, y = pbmc$seurat_clusters)
}

cluster.results <- data.frame(
  dims = param.grid$dims,
  k = param.grid$k,
  ari = ari.values
)

# heatmap of ARI over the parameter grid
p <- ggplot(data = cluster.results, mapping = aes(x = dims, y = k, fill = ari)) +
  geom_tile() +
  scale_fill_viridis_c() +
  labs(
    y = "Number of nearest neighbors (k)",
    x = "LSI dimensions (2:n)",
    fill = "Adjusted Rand Index"
  ) +
  theme_classic()

ggsave(filename = "figures/cluster_param_sweep.png", plot = p, height = 4, width = 7)

R ggplot2 Seurat Signac mclust From line 1 of code/clustering.R

library(ArchR)
library(GenomicRanges)

# load metadata (first line of the table is skipped; the cell name is column 1)
metadata <- read.table(
  file = "data/biccn/Supplementary Table 2 - Metatable of nuclei.tsv",
  sep = "\t",
  skip = 1
)
rownames(metadata) <- metadata$V1
colnames(metadata) <- c(
  "cell", "sample", "barcode", "logUM", "TSSe",
  "class", "MajorType", "SubType", "na"
)
cells <- metadata$cell

frags <- "data/biccn/fragments.bed.gz"

# read the peak set and drop peaks on chrM
peaks <- makeGRangesFromDataFrame(
  read.table(file = "data/biccn/unified_peaks.bed", sep = "\t", header = TRUE)
)
peaks <- peaks[seqnames(peaks) != "chrM"]
message("Using ", length(peaks), " peaks")

addArchRThreads(threads = 8)
addArchRGenome("mm10")

# everything from here to the elapsed-time computation is timed
start.time <- Sys.time()

ArrowFiles <- createArrowFiles(
  inputFiles = frags,
  sampleNames = "BICCN",
  validBarcodes = cells,
  minFrags = 1,
  addTileMat = FALSE,
  addGeneScoreMat = FALSE,
  force = TRUE
)

proj <- ArchRProject(ArrowFiles = ArrowFiles, copyArrows = TRUE, showLogo = FALSE)

# peak matrix -> iterative LSI -> clusters -> UMAP
proj <- addPeakSet(ArchRProj = proj, peakSet = peaks, force = TRUE)
proj <- addPeakMatrix(ArchRProj = proj, force = TRUE)
proj <- addIterativeLSI(ArchRProj = proj, useMatrix = "PeakMatrix", force = TRUE)
proj <- addClusters(input = proj, dimsToUse = 2:100, force = TRUE)
proj <- addUMAP(ArchRProj = proj, dimsToUse = 2:100, force = TRUE)

# total wall-clock time in seconds, written as a single line
elapsed <- as.numeric(Sys.time() - start.time, units = "secs")
writeLines(text = as.character(elapsed), con = "data/biccn/archr_total_runtime.txt")

R GenomicRanges From line 1 of code/create_biccn_archr.R

library(Signac)
library(Seurat)
library(GenomicRanges)
library(future)

# 8 multicore workers, with no size limit on exported globals
plan("multicore", workers = 8)
options(future.globals.maxSize = Inf)

annot <- readRDS(file = "data/biccn/annotations.rds")

# load metadata (first line of the table is skipped; the cell name is column 1)
metadata <- read.table(
  file = "data/biccn/Supplementary Table 2 - Metatable of nuclei.tsv",
  sep = "\t",
  skip = 1
)
rownames(metadata) <- metadata$V1
colnames(metadata) <- c(
  "cell", "sample", "barcode", "logUM", "TSSe",
  "class", "MajorType", "SubType", "na"
)
cells <- metadata$cell

frags <- "data/biccn/fragments.bed.gz"
peaks <- makeGRangesFromDataFrame(
  read.table(file = "data/biccn/unified_peaks.bed", sep = "\t", header = TRUE)
)

# everything from here to the elapsed-time computation is timed
start.time <- Sys.time()

fragments <- CreateFragmentObject(path = frags, cells = cells)

# quantify
counts <- FeatureMatrix(fragments = fragments, features = peaks, cells = cells)

# create object
assay <- CreateChromatinAssay(
  counts = counts,
  fragments = fragments,
  annotation = annot
)
obj <- CreateSeuratObject(counts = assay, assay = "ATAC")

gc()

# QC
obj <- NucleosomeSignal(obj)
obj <- TSSEnrichment(obj)

# LSI
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj)

# clustering and UMAP on LSI components 2 to 100
lsi.dims <- 2:100
obj <- FindNeighbors(obj, reduction = "lsi", dims = lsi.dims)
obj <- FindClusters(obj)
obj <- RunUMAP(obj, reduction = "lsi", dims = lsi.dims)

# total wall-clock time in seconds, written as a single line
elapsed <- as.numeric(Sys.time() - start.time, units = "secs")
writeLines(text = as.character(elapsed), con = "data/biccn/signac_total_runtime.txt")

R Seurat GenomicRanges Signac future From line 1 of code/create_biccn_signac.R

library(ArchR)
library(GenomicRanges)

# inputs: cell barcodes (one per line), fragment file path and peak set
cells <- readLines(con = "data/pbmc_atac/cells.txt")
frags <- "data/pbmc_atac/fragments.bed.gz"
peaks <- makeGRangesFromDataFrame(
  read.table(file = "data/pbmc_atac/peaks.bed", sep = "\t", header = TRUE)
)

addArchRThreads(threads = 8)
addArchRGenome("hg19")

# everything from here to the elapsed-time computation is timed
start.time <- Sys.time()

ArrowFiles <- createArrowFiles(
  inputFiles = frags,
  sampleNames = "PBMC",
  validBarcodes = cells,
  minFrags = 1,
  addTileMat = FALSE,
  addGeneScoreMat = FALSE,
  force = TRUE
)

proj <- ArchRProject(ArrowFiles = ArrowFiles, copyArrows = TRUE, showLogo = FALSE)

# peak matrix -> iterative LSI -> clusters -> UMAP
proj <- addPeakSet(ArchRProj = proj, peakSet = peaks, force = TRUE)
proj <- addPeakMatrix(ArchRProj = proj, force = TRUE)
proj <- addIterativeLSI(
  ArchRProj = proj,
  useMatrix = "PeakMatrix",
  sampleCellsPre = NULL,
  force = TRUE
)
proj <- addClusters(input = proj, force = TRUE)
proj <- addUMAP(ArchRProj = proj, force = TRUE)

# total wall-clock time in seconds, written as a single line
elapsed <- as.numeric(Sys.time() - start.time, units = "secs")
writeLines(text = as.character(elapsed), con = "data/pbmc_atac/archr_total_runtime.txt")

R GenomicRanges From line 1 of code/create_pbmc_atac_archr.R

library(Signac)
library(Seurat)
library(GenomicRanges)
library(future)

# The "multiprocess" strategy is deprecated in the future package; use
# "multicore" with 8 workers, as the BICCN Signac script does.
plan("multicore", workers = 8)
# allow up to 50 GiB of globals to be exported to workers
options(future.globals.maxSize = 50 * 1024 ^ 3)

# inputs: gene annotations, cell barcodes (one per line), fragments and peaks
annot <- readRDS("data/pbmc_atac/annotations.rds")
cells <- readLines("data/pbmc_atac/cells.txt")
frags <- "data/pbmc_atac/fragments.bed.gz"
peaks <- read.table(file = "data/pbmc_atac/peaks.bed", sep = "\t", header = TRUE)
peaks <- makeGRangesFromDataFrame(peaks)

# everything from here to the elapsed-time computation is timed
start.time <- Sys.time()
fragments <- CreateFragmentObject(
  path = frags,
  cells = cells
)

# quantify
counts <- FeatureMatrix(
  fragments = fragments,
  features = peaks,
  cells = cells
)

# create object
assay <- CreateChromatinAssay(counts = counts, fragments = fragments, annotation = annot)
obj <- CreateSeuratObject(counts = assay, assay = "ATAC")

# QC
obj <- NucleosomeSignal(obj)
obj <- TSSEnrichment(obj)

# LSI
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj)

# clustering
obj <- FindNeighbors(obj, reduction = "lsi", dims = 2:30)
obj <- FindClusters(obj)

# UMAP
obj <- RunUMAP(obj, reduction = "lsi", dims = 2:30)

# total wall-clock time in seconds, written as a single line
elapsed <- as.numeric(Sys.time() - start.time, units = "secs")
writeLines(text = as.character(elapsed), con = "data/pbmc_atac/signac_total_runtime.txt")

R Seurat GenomicRanges Signac future From line 1 of code/create_pbmc_atac_signac.R

library(ArchR)
library(Seurat)
library(Signac)
set.seed(1234)

# create object from each downsampled fragment file
# presumably set so that cell counts such as 100000 are not pasted into file
# names in scientific notation -- TODO confirm
options(scipen=999)
downsamples_biccn <- c(50000, 100000, 200000, 300000, 400000, 500000, 600000, 700000)
downsamples_pbmc <- seq(from = 1000, to = 26000, by = 2000)

addArchRThreads(threads = 1)

# Create an Arrow file and an ArchR project for every downsampled dataset and
# append the time taken by each of the two steps to `timing_file`.
#   downsamples:   numeric vector of downsampled cell counts
#   genome:        genome name passed to addArchRGenome()
#   fragment_dir:  directory (with trailing slash) holding <n>.rds and <n>.bed.gz
#   sample_prefix: prefix of the ArchR sample name (<prefix><n>)
#   project_dir:   directory (with trailing slash) for the saved projects
#   dataset:       dataset label written to the timing file
#   timing_file:   tab-separated file that timings are appended to
create_archr_projects <- function(downsamples, genome, fragment_dir,
                                  sample_prefix, project_dir, dataset,
                                  timing_file) {
  addArchRGenome(genome)
  # appending fails if the target directory does not exist yet
  dir.create(dirname(timing_file), showWarnings = FALSE, recursive = TRUE)
  for (i in downsamples) {
    # load the signac fragment file to get list of cells to include
    frags <- readRDS(paste0(fragment_dir, i, ".rds"))
    cells <- Cells(frags)

    # time Arrow file creation
    start.time <- Sys.time()
    ArrowFiles <- createArrowFiles(
      inputFiles = paste0(fragment_dir, i, ".bed.gz"),
      sampleNames = paste0(sample_prefix, i),
      validBarcodes = cells,
      excludeChr = "",
      force = TRUE,
      minFrags = 1,
      addGeneScoreMat = FALSE,
      addTileMat = FALSE
    )
    elapsed.arrow <- as.numeric(Sys.time() - start.time, units = "secs")

    # time project creation
    start.time <- Sys.time()
    proj <- ArchRProject(
      ArrowFiles = ArrowFiles,
      copyArrows = FALSE,
      showLogo = FALSE
    )
    elapsed.proj <- as.numeric(Sys.time() - start.time, units = "secs")
    saveArchRProject(ArchRProj = proj, outputDirectory = paste0(project_dir, i))

    # save timing: one "Arrow" line and one "Project" line per dataset
    write(
      x = paste0(
        elapsed.arrow, "\tArrow\t", dataset, "\t", i, "\n",
        elapsed.proj, "\tProject\t", dataset, "\t", i
      ),
      file = timing_file,
      append = TRUE
    )
  }
  invisible(NULL)
}

create_archr_projects(
  downsamples = downsamples_pbmc,
  genome = "hg19",
  fragment_dir = "/scratch/tim/pbmc_atac/downsampling/",
  sample_prefix = "pbmc_",
  project_dir = "archr_pbmc/",
  dataset = "PBMC",
  timing_file = "data/pbmc_atac/benchmarks/archr_object_creation.tsv"
)

create_archr_projects(
  downsamples = downsamples_biccn,
  genome = "mm10",
  fragment_dir = "/scratch/tim/biccn/downsampling/",
  sample_prefix = "biccn_",
  project_dir = "archr_biccn/",
  dataset = "BICCN",
  timing_file = "data/biccn/benchmarks/archr_object_creation.tsv"
)

R Seurat Signac From line 1 of downsampling_code/downsample_archr.R

library(Signac)
set.seed(1234)

# load the full dataset
biccn <- readRDS("objects/biccn.rds")
pbmc <- readRDS("objects/pbmc_atac.rds")

# randomly sample different numbers of cells
downsamples_biccn <- c(50000, 100000, 200000, 300000, 400000, 500000, 600000, 700000)
downsamples_pbmc <- seq(from = 1000, to = ncol(pbmc), by = 2000)

# lapply always returns a list with one character vector per sample size.
# sapply would simplify to a matrix whenever all sizes are equal (for example
# a single size), and cells[[i]] would then return a single barcode.
cells.biccn <- lapply(X = downsamples_biccn, FUN = function(x) {
  sample(x = colnames(x = biccn), replace = FALSE, size = x)
})

cells.pbmc <- lapply(X = downsamples_pbmc, FUN = function(x) {
  sample(x = colnames(x = pbmc), replace = FALSE, size = x)
})

# Write a filtered fragment file for each downsampled cell set, build a
# fragment object from it, save that object, and log how long building took.
#   fragpath:    path to the full fragment file
#   downsamples: vector of sample sizes, used to name the output files
#   cells:       list-like; cells[[i]] holds the barcodes for downsamples[[i]]
#   outpath:     output prefix for <size>.bed.gz and <size>.rds
#   timepath:    file that one tab-separated timing line is appended to per size
#   project:     label written to the timing file
# Only CreateFragmentObject() is timed; FilterCells() runs before the timer.
downsample_fragments <- function(fragpath, downsamples, cells, outpath, timepath, project) {
  for (idx in seq_along(downsamples)) {
    keep <- cells[[idx]]
    size_label <- format(x = downsamples[[idx]], scientific = FALSE)
    bed_out <- paste0(outpath, size_label, ".bed.gz")
    rds_out <- paste0(outpath, size_label, ".rds")

    FilterCells(
      fragments = fragpath,
      cells = keep,
      outfile = bed_out,
      verbose = TRUE
    )

    t0 <- Sys.time()
    frags <- CreateFragmentObject(path = bed_out, cells = keep)
    elapsed <- as.numeric(Sys.time() - t0, units = "secs")

    saveRDS(object = frags, file = rds_out, version = 2)

    timing_line <- paste0(elapsed, "\tSignac\t", project, "\t", size_label)
    write(x = timing_line, file = timepath, append = TRUE)
  }
}

# BICCN: downsample the fragment file and time fragment object creation
downsample_fragments(
  project = "BICCN",
  fragpath = "data/biccn/fragments.bed.gz",
  downsamples = downsamples_biccn,
  cells = cells.biccn,
  outpath = "/scratch/tim/biccn/downsampling/",
  timepath = "data/biccn/benchmarks/signac_object_creation.tsv"
)

# PBMC: same procedure on the PBMC dataset
downsample_fragments(
  project = "PBMC",
  fragpath = "data/pbmc_atac/fragments.bed.gz",
  downsamples = downsamples_pbmc,
  cells = cells.pbmc,
  outpath = "/scratch/tim/pbmc_atac/downsampling/",
  timepath = "data/pbmc_atac/benchmarks/signac_object_creation.tsv"
)

R Signac From line 1 of downsampling_code/downsample.R

library(Signac)
library(EnsDb.Mmusculus.v79)
library(EnsDb.Hsapiens.v75)
library(GenomeInfoDb)

# Gene annotations are pulled from EnsDb and switched to UCSC style,
# since the data was mapped to hg19/mm10.

# mouse (mm10)
annotations.mm <- GetGRangesFromEnsDb(ensdb = EnsDb.Mmusculus.v79)
seqlevelsStyle(annotations.mm) <- "UCSC"
genome(annotations.mm) <- "mm10"
saveRDS(object = annotations.mm, file = "data/biccn/annotations.rds")

# human (hg19)
annotations.hg <- GetGRangesFromEnsDb(ensdb = EnsDb.Hsapiens.v75)
seqlevelsStyle(annotations.hg) <- "UCSC"
genome(annotations.hg) <- "hg19"
saveRDS(object = annotations.hg, file = "data/pbmc_atac/annotations.rds")

R EnsDb.Hsapiens.v75 EnsDb.Mmusculus.v79 Signac GenomeInfoDb From line 1 of downsampling_code/get_annotations.R

library(ggplot2)
library(patchwork)

# Assemble figure 2 and supplementary figure 1 from previously saved plots

# 12 pt text styling shared by several panels
text12 <- theme(text = element_text(size = 12), axis.text = element_text(size = 12))

# load the saved plot objects
qc_dist <- readRDS("figures/qc_dist.rds")
tss_plot <- readRDS("figures/tss_enrichment.rds")
nucleosome_plot <- readRDS("figures/nucleosome_signal.rds") + ggtitle("Nucleosome signal")
atac_dimplot <- readRDS("figures/pbmc_atac_dimplot.rds")
mp <- readRDS("figures/motifplot.rds")
tf_chromvar <- readRDS("figures/chromvar_vln.rds")
tf_expression <- readRDS("figures/tf_rna_vln.rds")
fp <- readRDS("figures/footprint.rds")
gene_per_link_plot <- readRDS("figures/genes_per_link_plot.rds")
link_per_gene_plot <- readRDS("figures/link_per_gene_plot.rds")
distplot_positive <- readRDS("figures/distance_positive.rds")
distplot_negative <- readRDS("figures/distance_negative.rds")
pval_dist <- readRDS("figures/link_pvals.rds")
# loaded, but not placed in any panel in this script
label_transfer_accuracy <- readRDS("figures/label_transfer_accuracy.rds")

# restyle and stack the individual panels
tf_chromvar <- tf_chromvar & text12
tf_expression <- tf_expression & text12
lnkplot <- gene_per_link_plot / link_per_gene_plot
distances <- pval_dist / distplot_positive / distplot_negative
qc <- (nucleosome_plot / tss_plot) & ggtitle("")
atac_dimplot <- atac_dimplot +
  theme(legend.position = "none") +
  xlab("UMAP 1") +
  ylab("UMAP 2")

qc_violin <- (qc_dist[[2]] / qc_dist[[1]]) &
  theme_bw() &
  theme(
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.ticks = element_blank()
  )

# top row: QC violins, QC curves, ATAC UMAP
top.panel <- (qc_violin | qc | atac_dimplot) + plot_layout(widths = c(1, 1, 2.5)) & text12

# motif plot stacked over chromVAR and RNA violins
panel1 <- ((mp / wrap_plots(tf_chromvar) / tf_expression) & xlab("")) &
  theme(text = element_text(size = 12))

ggsave(filename = "figures/figure2.png", plot = top.panel, width = 12, height = 5.5, units = "in", dpi = 400)
ggsave(filename = "figures/figure2_2.png", plot = panel1, width = 6, height = 6, units = "in")
ggsave(filename = "figures/figure2_3.png", plot = fp, width = 4, height = 6, units = "in")
ggsave(filename = "figures/figure2_4.png", plot = lnkplot, width = 5, height = 6, units = "in")
ggsave(filename = "figures/figure2_5.png", plot = distances, width = 4, height = 6, units = "in")

# linked coverage plots, side by side
linked_1 <- readRDS("figures/linked_covplot1.rds")
linked_2 <- readRDS("figures/linked_covplot2.rds")

lower.panel <- (linked_1 | linked_2) &
  theme(text = element_text(size = 10), axis.text = element_text(size = 10))

ggsave(filename = "figures/figure2_6.png", plot = lower.panel, width = 16, height = 5, units = "in")

## supplementary figure 1
cp <- readRDS("figures/cellranger_peakcalling.rds")
missed <- readRDS("figures/macs2_pseudobulk.rds")
missed_peak_count <- readRDS("figures/missed_peak_count.rds")
supfig1 <- missed | (missed_peak_count / plot_spacer()) | cp
ggsave(filename = "figures/figs1.png", plot = supfig1, height = 8, width = 15, units = "in")

R ggplot2 patchwork From line 1 of code/figure2.R

library(ggplot2)
library(patchwork)

# Assemble figure 4 from the saved mitochondrial plots, using 10 pt text

dimplot.mito <- readRDS("figures/mito_dimplot.rds") & theme(text = element_text(size = 10))
varplot.mito <- readRDS("figures/mito_varplot.rds") + theme(text = element_text(size = 10))

# NOTE(review): this script used to build featplot.mito twice. The first
# version (legend title blanked) was overwritten by a fresh read before it was
# ever used, so only the effective assignment is kept here. If the legend title
# was meant to be hidden, append `& theme(legend.title = element_blank())`.
featplot.mito <- readRDS("figures/mito_allele_plot.rds") & labs(color = "Allele frequency") & theme(text = element_text(size = 10))

# rotate the heatmap x labels so they do not overlap
heatmap.mito <- readRDS("figures/mito_clone_hm.rds") + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) + theme(text = element_text(size = 10))

# coverage plots: 10 pt text overall, smaller x axis labels
covplots <- readRDS("figures/mito_covplot.rds") & theme(text = element_text(size = 10))
covplots <- covplots & theme(axis.text.x = element_text(size=4))

# top row: three panels; bottom row: heatmap and coverage plots at 1:2 width
fig4 <- (dimplot.mito | varplot.mito | featplot.mito) / ((heatmap.mito | covplots) + plot_layout(widths = c(1, 2)))

ggsave(filename = "figures/figure4.png", plot = fig4, height = 8, width = 12, units = "in")

R ggplot2 patchwork From line 1 of code/figure4.R

library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(paletteer)

# objects used for the UMAP panels
biccn <- readRDS("objects/biccn.rds")
pbmc <- readRDS("objects/pbmc_atac.rds")

# collated benchmark timings; Cores is treated as a discrete variable
timings_biccn <- read.table("data/biccn/timings.tsv")
timings_biccn$Cores <- as.factor(timings_biccn$Cores)
timings_pbmc <- read.table("data/pbmc_atac/timings.tsv")
timings_pbmc$Cores <- as.factor(timings_pbmc$Cores)

# one fixed colour per method
colors.use <- paletteer_d("ggthemes::Tableau_10")
signac_color <- colors.use[[1]]
archr_color <- colors.use[[9]]

# dimplots
# BICCN cells labelled by major cell type
dp <- DimPlot(
  biccn,
  group.by = "MajorType",
  label = TRUE,
  repel = TRUE,
  raster = FALSE,
  pt.size = 0.1
) +
  theme_classic() +
  NoLegend() +
  ggtitle(label = "Adult mouse brain", subtitle = "734,000 nuclei")

# BICCN cells coloured by orig.ident, drawn in shuffled order
dp_batch <- DimPlot(
  biccn,
  group.by = "orig.ident",
  label = FALSE,
  raster = FALSE,
  pt.size = 0.1,
  shuffle = TRUE
) +
  theme_classic() +
  ggtitle(label = "Adult mouse brain", subtitle = "734,000 nuclei")

# PBMC cells drawn as a single group
pbmc$all <- "PBMC"
dp_pbmc <- DimPlot(pbmc, group.by = "all", label = FALSE) +
  theme_classic() +
  NoLegend() +
  ggtitle(label = "Human PBMCs", subtitle = "26,579 nuclei")

########## PBMC ##########
# components shared by the PBMC runtime panels (runtime is shown in minutes)
pbmc_mapping <- aes(x = Cells, y = Runtime / 60, color = Cores)
pbmc_points <- list(geom_point(), geom_smooth(se = FALSE))
pbmc_look <- list(
  ylab("Runtime (minutes)"),
  theme_bw(),
  scale_x_continuous(labels = scales::comma, breaks = seq(0, 25000, 5000)),
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
)
pbmc_no_legend <- theme(legend.position = "none")

# object creation
runtime_create <- ggplot(data = timings_pbmc[timings_pbmc$Step == "Create", ], mapping = pbmc_mapping) +
  pbmc_points +
  facet_wrap(~Method) +
  pbmc_look +
  pbmc_no_legend +
  ggtitle("Object creation")

# featurematrix timings
runtime_fmat <- ggplot(data = timings_pbmc[timings_pbmc$Step == "FeatureMatrix", ], mapping = pbmc_mapping) +
  pbmc_points +
  facet_wrap(~Method) +
  pbmc_look +
  ggtitle(label = "FeatureMatrix", subtitle = "160,906 peaks")

# gene activity
runtime_ga <- ggplot(data = timings_pbmc[timings_pbmc$Step == "GeneActivity", ], mapping = pbmc_mapping) +
  pbmc_points +
  facet_wrap(~Method) +
  pbmc_look +
  ggtitle(label = "GeneActivity")

# nucleosome signal timings (not coloured by core count, no facets)
runtime_ns <- ggplot(
  data = timings_pbmc[timings_pbmc$Step == "NucleosomeSignal", ],
  mapping = aes(x = Cells, y = Runtime / 60)
) +
  pbmc_points +
  pbmc_look +
  ggtitle("NucleosomeSignal")

# QC timings: one facet per QC step, each with its own y axis
runtime_qc <- ggplot(
  data = timings_pbmc[timings_pbmc$Step %in% c("NucleosomeSignal", "TSSEnrichment"), ],
  mapping = pbmc_mapping
) +
  pbmc_points +
  facet_wrap(~Step, scales = "free_y") +
  pbmc_look +
  ggtitle("Signac quality control metrics")

# LSI timings
runtime_lsi <- ggplot(data = timings_pbmc[timings_pbmc$Step == "LSI", ], mapping = pbmc_mapping) +
  pbmc_points +
  facet_wrap(~Method) +
  pbmc_look +
  pbmc_no_legend +
  ggtitle("LSI")

# total runtime (files hold seconds; plotted in minutes)
archr_pbmc_total <- as.numeric(readLines(con = "data/pbmc_atac/archr_total_runtime.txt"))
signac_pbmc_total <- as.numeric(readLines(con = "data/pbmc_atac/signac_total_runtime.txt"))

pbmc_totals <- data.frame(
  Method = c("Signac", "ArchR"),
  runtime = c(signac_pbmc_total, archr_pbmc_total) / 60
)

total_pbmc <- ggplot(data = pbmc_totals, mapping = aes(y = runtime, x = Method, fill = Method)) +
  geom_bar(stat = "identity") +
  ylab("Runtime (minutes)") +
  theme_bw() +
  ggtitle(label = "Total runtime", subtitle = "26,579 nuclei; 8 cores") +
  scale_fill_manual(values = c(archr_color, signac_color)) +
  pbmc_no_legend

# collate all runtime panels into three columns with a shared legend
runtimes_pbmc <- (
  (runtime_create / runtime_fmat) |
    (runtime_ga / runtime_qc) |
    (runtime_lsi / total_pbmc)
) + plot_layout(guides = "collect")

######### BICCN ###########
# components shared by the BICCN runtime panels (runtime is shown in minutes)
biccn_mapping <- aes(x = Cells, y = Runtime / 60, color = Cores)
biccn_points <- list(geom_point(), geom_smooth(se = FALSE))
biccn_look <- list(
  ylab("Runtime (minutes)"),
  theme_bw(),
  scale_x_continuous(labels = scales::comma, breaks = seq(0, 700000, 200000)),
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
)
biccn_hourly_y <- scale_y_continuous(breaks = seq(0, 300, 60))
biccn_no_legend <- theme(legend.position = "none")

# object creation
runtime_create <- ggplot(data = timings_biccn[timings_biccn$Step == "Create", ], mapping = biccn_mapping) +
  biccn_points +
  facet_wrap(~Method) +
  biccn_look +
  biccn_no_legend +
  ggtitle("Object creation")

# featurematrix timings
runtime_fmat <- ggplot(data = timings_biccn[timings_biccn$Step == "FeatureMatrix", ], mapping = biccn_mapping) +
  biccn_points +
  facet_wrap(~Method) +
  biccn_look +
  biccn_hourly_y +
  ggtitle(label = "FeatureMatrix", subtitle = "263,815 peaks")

# gene activity
runtime_ga <- ggplot(data = timings_biccn[timings_biccn$Step == "GeneActivity", ], mapping = biccn_mapping) +
  biccn_points +
  facet_wrap(~Method) +
  biccn_look +
  biccn_hourly_y +
  ggtitle(label = "GeneActivity")

# QC step: one facet per QC step
runtime_qc <- ggplot(
  data = timings_biccn[timings_biccn$Step %in% c("NucleosomeSignal", "TSSEnrichment"), ],
  mapping = biccn_mapping
) +
  biccn_points +
  facet_wrap(~Step) +
  biccn_look +
  ggtitle("Signac quality control metrics")

# LSI timings
runtime_lsi <- ggplot(data = timings_biccn[timings_biccn$Step == "LSI", ], mapping = biccn_mapping) +
  biccn_points +
  facet_wrap(~Method) +
  biccn_look +
  biccn_no_legend +
  ggtitle("LSI")

# total runtime (files hold seconds; plotted in hours)
archr_biccn_total <- as.numeric(readLines(con = "data/biccn/archr_total_runtime.txt"))
signac_biccn_total <- as.numeric(readLines(con = "data/biccn/signac_total_runtime.txt"))

biccn_totals <- data.frame(
  Method = c("Signac", "ArchR"),
  runtime = c(signac_biccn_total, archr_biccn_total) / 60 / 60
)

total_biccn <- ggplot(data = biccn_totals, mapping = aes(y = runtime, x = Method, fill = Method)) +
  geom_bar(stat = "identity") +
  ylab("Runtime (hours)") +
  theme_bw() +
  scale_y_continuous(breaks = c(0, 1, 2, 3, 4, 5)) +
  ggtitle(label = "Total runtime", subtitle = "734,000 nuclei; 8 cores") +
  scale_fill_manual(values = c(archr_color, signac_color)) +
  biccn_no_legend

# collate all runtime panels into three columns with a shared legend
runtimes_biccn <- (
  (runtime_create / runtime_fmat) |
    (runtime_ga / runtime_qc) |
    (runtime_lsi / total_biccn)
) + plot_layout(guides = "collect")

# save plots
ggsave(filename = "figures/figure5a.png", plot = runtimes_pbmc, height = 6, width = 12, dpi = 500)
ggsave(filename = "figures/figure5b.png", plot = runtimes_biccn, height = 6, width = 12, dpi = 500)
ggsave(filename = "figures/biccn_dimplot_batch.png", plot = dp_batch, height = 12, width = 18, dpi = 300)
ggsave(filename = "figures/biccn_dimplot.png", plot = dp, height = 6, width = 5, dpi = 300)
ggsave(filename = "figures/pbmc_atac_dimplot.png", plot = dp_pbmc, height = 6, width = 5, dpi = 300)

R ggplot2 Seurat patchwork Signac paletteer From line 1 of code/figure5.R

library(Signac)
library(Seurat)
library(future)
# Run with 8 parallel workers and a 50 * 1024^3 byte limit on exported globals.
# NOTE(review): newer releases of the future package have retired the
# "multiprocess" strategy - confirm against the installed version.
plan(strategy = "multiprocess", workers = 8)
options(future.globals.maxSize = 50 * 1024 ^ 3)

pbmc <- readRDS('objects/pbmc.rds')
DefaultAssay(pbmc) <- "ATAC"

# link peaks to genes: ATAC peaks against SCT expression
pbmc <- LinkPeaks(object = pbmc, peak.assay = "ATAC", expression.assay = "SCT")

# only the links are stored, not the whole object
pbmc_links <- Links(pbmc)
saveRDS(object = pbmc_links, file = "objects/pbmc_links.rds")

R Seurat Signac future From line 1 of code/link_peaks.R

library(Signac)
library(Seurat)

pbmc <- readRDS("objects/pbmc.rds")

# label transfer
# Reference: a separate object built from the RNA counts, carrying over all
# cell-level metadata, then normalized, feature-selected and scaled.
rna_counts <- GetAssayData(pbmc, assay = "RNA", slot = "counts")
pbmc.rna <- CreateSeuratObject(counts = rna_counts, meta.data = pbmc[[]])
pbmc.rna <- ScaleData(FindVariableFeatures(NormalizeData(pbmc.rna), nfeatures = 3000))

# Identify anchors between the RNA reference and the gene activity ("GA") assay
transfer.anchors <- FindTransferAnchors(
  reference = pbmc.rna,
  reference.assay = "RNA",
  query = pbmc,
  query.assay = "GA",
  features = VariableFeatures(object = pbmc.rna),
  reduction = "cca",
  dims = 1:30
)

# labels to transfer, at fine and coarse resolution
pbmc.rna$ct <- pbmc$celltype
pbmc.rna$coarse_celltype <- pbmc$coarse_celltype

# Transfer one set of reference labels, weighting anchors in LSI space
# (dimensions 2 to 30).
transfer_labels <- function(labels) {
  TransferData(
    anchorset = transfer.anchors,
    refdata = labels,
    weight.reduction = pbmc[["lsi"]],
    dims = 2:30
  )
}

celltype.predictions <- transfer_labels(pbmc.rna$ct)
coarse.predictions <- transfer_labels(pbmc.rna$coarse_celltype)

# attach the predictions to a copy of the object
pbmc.atac <- AddMetaData(pbmc, metadata = celltype.predictions)
pbmc.atac$coarse_predicted <- coarse.predictions$predicted.id

# remove unneeded assays
for (assay_name in c("RNA", "SCT", "GA")) {
  pbmc.atac[[assay_name]] <- NULL
}

# keep the RNA-defined cell type alongside the predictions
pbmc.atac$gt <- as.factor(pbmc.rna$ct)
saveRDS(object = pbmc.atac, file = "objects/multimodal_label_transfer.rds")

R Seurat Signac From line 1 of code/multimodal_label_transfer.R

# cell numbers of the downsampled PBMC datasets, and core counts to benchmark
declare -a ncell=("1000" "3000" "5000" "7000" "9000" "11000" "13000" "15000" "17000" "19000" "21000" "23000" "25000")
declare -a ncore=("1" "2" "4" "8")

# -p also creates missing parent directories and is a no-op when the
# directory already exists
mkdir -p data/pbmc_atac/benchmarks

# feature matrix
for i in "${ncell[@]}"; do
  # ArchR doesn't seem to obey the set number of threads, so every run is
  # pinned with taskset to as many CPUs as the thread count it is given
  for j in "${ncore[@]}"; do
    cpus=$(seq -s, 1 "$j")  # e.g. 4 -> 1,2,3,4
    taskset --cpu-list "$cpus" Rscript --vanilla code/downsampling_code/run_archr_peakmatrix.R \
        "$j" \
        "archr_pbmc/$i" \
        data/pbmc_atac/peaks.bed \
        3 \
        "data/pbmc_atac/benchmarks/archr_featmat_runtime_${i}_${j}.txt" \
        "hg19"
  done
done


# gene activity
for i in "${ncell[@]}"; do
  # ArchR doesn't seem to obey the set number of threads, so every run is
  # pinned with taskset to as many CPUs as the thread count it is given
  for j in "${ncore[@]}"; do
    cpus=$(seq -s, 1 "$j")  # e.g. 4 -> 1,2,3,4
    taskset --cpu-list "$cpus" Rscript --vanilla code/downsampling_code/run_archr_geneactivity.R \
        "$j" \
        "archr_pbmc/$i" \
        3 \
        "data/pbmc_atac/benchmarks/archr_geneactivity_runtime_${i}_${j}.txt" \
        "hg19"
  done
done

# lsi (pinned to a single CPU)
for cells in "${ncell[@]}"; do
  taskset --cpu-list 1 \
    Rscript --vanilla code/downsampling_code/run_archr_lsi.R \
    "archr_pbmc/${cells}" \
    data/pbmc_atac/peaks.bed \
    3 \
    "data/pbmc_atac/benchmarks/archr_lsi_runtime_${cells}.txt" \
    "hg19"
done

# estimated lsi (pinned to a single CPU)
for cells in "${ncell[@]}"; do
  taskset --cpu-list 1 \
    Rscript --vanilla code/downsampling_code/run_archr_estimated_lsi.R \
    "archr_pbmc/${cells}" \
    data/pbmc_atac/peaks.bed \
    3 \
    "data/pbmc_atac/benchmarks/archr_est_lsi_runtime_${cells}.txt" \
    "hg19"
done

Shell From line 2 of pbmc_atac_downsampling/benchmark_archr.sh

# cell numbers of the downsampled PBMC datasets, and core counts to benchmark
declare -a ncell=("1000" "3000" "5000" "7000" "9000" "11000" "13000" "15000" "17000" "19000" "21000" "23000" "25000")
declare -a ncore=("1" "2" "4" "8")

# -p also creates missing parent directories and is a no-op when the
# directory already exists
mkdir -p data/pbmc_atac/benchmarks
mkdir -p data/pbmc_atac/downsampling

# run each step with different numbers of cores, profile max memory usage
# (/usr/bin/time -v writes its report to the *_mem_* file)

# feature matrix
for cells in "${ncell[@]}"; do
  for threads in "${ncore[@]}"; do
    /usr/bin/time -o "data/pbmc_atac/benchmarks/featmat_mem_${cells}_${threads}.txt" -v \
      Rscript --vanilla code/downsampling_code/run_featurematrix.R \
      "${threads}" \
      "/scratch/tim/pbmc_atac/downsampling/${cells}.rds" \
      data/pbmc_atac/peaks.bed \
      3 \
      "data/pbmc_atac/benchmarks/featmat_runtime_${cells}_${threads}.txt" \
      "data/pbmc_atac/downsampling/counts_${cells}.rds"
  done
done

# nucleosome signal (starts from the counts written by the feature matrix step)
for cells in "${ncell[@]}"; do
  /usr/bin/time -o "data/pbmc_atac/benchmarks/nucleosome_mem_${cells}.txt" -v \
    Rscript --vanilla code/downsampling_code/run_nucleosome.R \
    "data/pbmc_atac/downsampling/counts_${cells}.rds" \
    "/scratch/tim/pbmc_atac/downsampling/${cells}.rds" \
    data/pbmc_atac/annotations.rds \
    3 \
    "data/pbmc_atac/benchmarks/nucleosome_runtime_${cells}.txt" \
    "data/pbmc_atac/downsampling/nucleosome_${cells}.rds"
done

# tss enrichment (starts from the object written by the nucleosome step)
for i in "${ncell[@]}"; do
  for j in "${ncore[@]}"; do
    # memory profile belongs with the other PBMC benchmark outputs; it used to
    # be written under data/biccn/benchmarks by mistake
    /usr/bin/time -o data/pbmc_atac/benchmarks/tss_mem_${i}_${j}.txt \
      -v Rscript --vanilla code/downsampling_code/run_tss.R \
      data/pbmc_atac/downsampling/nucleosome_${i}.rds \
      3 \
      data/pbmc_atac/benchmarks/tss_runtime_${i}_${j}.txt \
      data/pbmc_atac/downsampling/tss_${i}.rds \
      $j
  done
done

# gene activity matrix (starts from the object written by the tss step)
for cells in "${ncell[@]}"; do
  for threads in "${ncore[@]}"; do
    /usr/bin/time -o "data/pbmc_atac/benchmarks/ga_mem_${cells}_${threads}.txt" -v \
      Rscript --vanilla code/downsampling_code/run_gene_activity.R \
      "data/pbmc_atac/downsampling/tss_${cells}.rds" \
      "${threads}" \
      3 \
      "data/pbmc_atac/benchmarks/ga_runtime_${cells}_${threads}.txt"
  done
done

# tf-idf (starts from the object written by the tss step)
for cells in "${ncell[@]}"; do
  /usr/bin/time -o "data/pbmc_atac/benchmarks/tfidf_mem_${cells}.txt" -v \
    Rscript --vanilla code/downsampling_code/run_tfidf.R \
    "data/pbmc_atac/downsampling/tss_${cells}.rds" \
    3 \
    "data/pbmc_atac/benchmarks/tfidf_runtime_${cells}.txt" \
    "data/pbmc_atac/downsampling/tfidf_${cells}.rds"
done

# svd (starts from the object written by the tf-idf step)
for cells in "${ncell[@]}"; do
  /usr/bin/time -o "data/pbmc_atac/benchmarks/svd_mem_${cells}.txt" -v \
    Rscript --vanilla code/downsampling_code/run_svd.R \
    "data/pbmc_atac/downsampling/tfidf_${cells}.rds" \
    3 \
    "data/pbmc_atac/benchmarks/svd_runtime_${cells}.txt" \
    "data/pbmc_atac/downsampling/svd_${cells}.rds"
done

Shell From line 3 of pbmc_atac_downsampling/benchmark.sh

# Collate the PBMC benchmark runtimes for Signac and ArchR into one table
# with columns Cells, Cores, Step, Runtime and Method.
downsamples <- seq(from = 1000, to = 26000, by = 2000)
cores <- c(1, 2, 4, 8)

# Read one benchmark runtime file (one measurement per line) as a numeric
# vector. as.numeric() is vectorised and always returns a numeric vector,
# whereas sapply(x, as.numeric) returns a list for an empty file.
read_runtime <- function(stem) {
  as.numeric(readLines(con = paste0("data/pbmc_atac/benchmarks/", stem, ".txt")))
}

# Timing rows for one step at one cell number / core count.
# `signac` and `archr` are runtime file stems (file name without directory and
# ".txt"); leave one as NULL when the step was only timed for the other method.
# Signac rows always come before ArchR rows.
step_rows <- function(step, cells, ncores, signac = NULL, archr = NULL) {
  rt_signac <- if (is.null(signac)) numeric(0) else read_runtime(signac)
  rt_archr <- if (is.null(archr)) numeric(0) else read_runtime(archr)
  data.frame(
    "Cells" = cells,
    "Cores" = ncores,
    "Step" = step,
    "Runtime" = c(rt_signac, rt_archr),
    "Method" = c(rep("Signac", length(rt_signac)), rep("ArchR", length(rt_archr)))
  )
}

# Apply `build` to every downsample size; returns a list of data frames.
per_size <- function(build) {
  lapply(downsamples, build)
}

# Apply `build` to every (size, cores) pair, sizes outermost; returns a flat
# list of data frames.
per_size_and_core <- function(build) {
  unlist(
    lapply(downsamples, function(i) lapply(cores, function(j) build(i, j))),
    recursive = FALSE
  )
}

# object creation: column 1 is the runtime, column 4 the number of cells.
# Only the "Arrow" records are used for ArchR.
runtime <- read.table(file = "data/pbmc_atac/benchmarks/signac_object_creation.tsv", sep = "\t")
runtime_archr <- read.table(file = "data/pbmc_atac/benchmarks/archr_object_creation.tsv", sep = "\t")
runtime_archr <- runtime_archr[runtime_archr$V2 == "Arrow", ]
create_rows <- data.frame(
  "Cells" = c(runtime$V4, runtime_archr$V4),
  "Cores" = 1,
  "Step" = "Create",
  "Runtime" = c(runtime$V1, runtime_archr$V1),
  "Method" = c(rep("Signac", nrow(runtime)), rep("ArchR", nrow(runtime_archr)))
)

# Gather every piece first and bind once, rather than growing the data frame
# with rbind() inside each loop.
pieces <- c(
  list(create_rows),
  per_size_and_core(function(i, j) {
    step_rows(
      "FeatureMatrix", i, j,
      signac = paste0("featmat_runtime_", i, "_", j),
      archr = paste0("archr_featmat_runtime_", i, "_", j)
    )
  }),
  per_size(function(i) {
    step_rows("NucleosomeSignal", i, 1, signac = paste0("nucleosome_runtime_", i))
  }),
  per_size_and_core(function(i, j) {
    step_rows(
      "GeneActivity", i, j,
      signac = paste0("ga_runtime_", i, "_", j),
      archr = paste0("archr_geneactivity_runtime_", i, "_", j)
    )
  }),
  per_size_and_core(function(i, j) {
    step_rows("TSSEnrichment", i, j, signac = paste0("tss_runtime_", i, "_", j))
  }),
  per_size(function(i) {
    step_rows("RunTFIDF", i, 1, signac = paste0("tfidf_runtime_", i))
  }),
  per_size(function(i) {
    step_rows("RunSVD", i, 1, signac = paste0("svd_runtime_", i))
  }),
  per_size(function(i) {
    step_rows("LSI", i, 1, archr = paste0("archr_lsi_runtime_", i))
  }),
  per_size(function(i) {
    step_rows("estLSI", i, 1, archr = paste0("archr_est_lsi_runtime_", i))
  })
)
results_df <- do.call(rbind, pieces)

# add LSI for Signac as the sum of the TF-IDF and SVD runtimes
# (rows of the two steps are paired by position)
tfidf <- results_df[results_df$Step == "RunTFIDF", ]
runsvd <- results_df[results_df$Step == "RunSVD", ]
lsi <- runsvd
lsi$Runtime <- runsvd$Runtime + tfidf$Runtime
lsi$Step <- "LSI"
results_df <- rbind(results_df, lsi)

results_df$Cells <- as.numeric(results_df$Cells)
results_df$Cores <- as.factor(results_df$Cores)

write.table(x = results_df, file = "data/pbmc_atac/timings.tsv")

R From line 1 of pbmc_atac_downsampling/collate_timings.R

library(Signac)
library(Seurat)
library(RANN)
library(ggplot2)
library(cluster)
library(dplyr)
library(patchwork)
library(paletteer)
library(SeuratDisk)

set.seed(1234)

pbmc <- readRDS("objects/pbmc.rds")
atac.assay <- "ATAC"

# methods shown in the figure, in plotting order
methods_keep <- c(
  "LSI (Cusanovich2018)",
  "SnapATAC",
  "cisTopic CGS",
  "cisTopic Warp",
  "SCALE",
  "LSI (log-TF)",
  "LSI (Signac)"
)

# one palette colour per kept method, in reversed palette order
colors.use <- paletteer_d("ggthemes::Tableau_10")
colors.use <- rev(colors.use[seq_along(methods_keep)])

######## Data loading #########

# Read the saved LSI results for one method at every downsampling level.
#
# method: method identifier used in the file names ("lsi_<fraction>_<method>.rds")
# path:   directory prefix, pasted in front of the file names as-is
#
# Returns a list of the loaded objects ordered by fraction: 1, 0.8, 0.6, 0.4, 0.2.
read_lsi <- function(method, path ="data/pbmc/downsamples/") {
  suffix <- paste0(as.character(method), ".rds")
  fractions <- c("1", "0.8", "0.6", "0.4", "0.2")
  file_names <- unlist(lapply(X = fractions, FUN = function(fraction) {
    paste0("lsi_", fraction, "_", suffix)
  }))
  lapply(X = paste0(path, file_names), FUN = readRDS)
}

# LSI results for each of the four LSI variants
lsi_1 <- read_lsi(method = 1)
lsi_2 <- read_lsi(method = 2)
lsi_3 <- read_lsi(method = 3)
lsi_4 <- read_lsi(method = 4)

# fraction of counts retained in each downsampled dataset
pbmc_ds <- c(1, 0.8, 0.6, 0.4, 0.2)

# Read "<prefix><fraction>.rds" for every downsampling level
read_downsamples <- function(prefix) {
  lapply(X = paste0(prefix, pbmc_ds, ".rds"), FUN = readRDS)
}

snap <- read_downsamples("data/pbmc/downsamples/snapatac_")

# cisTopic results are transposed after loading
ct_cgs <- lapply(read_downsamples("data/pbmc/downsamples/cistopic_cgs_"), t)
ct_warp <- lapply(read_downsamples("data/pbmc/downsamples/cistopic_warp_"), t)

# convert h5ad to h5seurat, one SCALE output directory per downsampling level
# (assumes Convert() returns the destination path, since its result is handed
# straight to LoadH5Seurat below - TODO confirm)
scale_pbmc_path <- lapply(X = pbmc_ds, FUN = function(fraction) {
  scale_dir <- paste0("data/pbmc/downsamples/scale_", fraction)
  Convert(
    source = paste0(scale_dir, "/adata.h5ad"),
    dest = paste0(scale_dir, "/adata.h5seurat"),
    overwrite = TRUE
  )
})

# load h5seurat
scale_pbmc_obj <- lapply(X = scale_pbmc_path, FUN = LoadH5Seurat)

# get embeddings from the "latent" reduction of each object
scale_pbmc <- lapply(X = scale_pbmc_obj, FUN = function(obj) {
  latent <- obj[["latent"]]
  Embeddings(latent)
})

######## Determine dimensions to use ##########

seqdepth_pbmc <- pbmc$nCount_ATAC

# For every embedding in a list, return the indices of the dimensions whose
# absolute correlation with sequencing depth exceeds 0.9
depth_dims <- function(emb_list) {
  lapply(emb_list, function(emb) {
    depth_cor <- abs(cor(emb, seqdepth_pbmc))
    which(depth_cor > 0.9)
  })
}

lsi1_depth <- depth_dims(lsi_1)        # dim 1
lsi2_depth <- depth_dims(lsi_2)        # no dims
lsi3_depth <- depth_dims(lsi_3)        # no dims
lsi4_depth <- depth_dims(lsi_4)        # no dims
cgs_depth <- depth_dims(ct_cgs)        # no dims
warp_depth <- depth_dims(ct_warp)      # no dims
snap_depth <- depth_dims(snap)         # dim 2
scale_depth <- depth_dims(scale_pbmc)  # no dims

######## KNN #########

# use cell types defined by RNA
clustering.use <- "celltype"
clusters <- pbmc[[clustering.use]][[1]]

# first define neighbor graph using the RNA assay
k <- 100
rna.emb <- Embeddings(pbmc[["pca"]])
# k + 1 neighbours are searched and the first column is dropped.
# NOTE(review): columns 2:k keep k - 1 neighbours per cell, not k.
rna_search <- nn2(data = rna.emb, k = k + 1)
rna.nn <- rna_search$nn.idx[, 2:k]

# Fraction of each cell's k nearest neighbours that carry the same cluster
# label as the cell itself.
#
# embeddings: matrix with one row per cell
# dims:       columns of `embeddings` used for the neighbour search
# clusters:   cluster label for each cell, in the row order of `embeddings`
# rna.nn:     not used by this function; kept so existing callers still work
# k:          number of neighbours to evaluate
#
# Returns a numeric vector with one purity value per element of `clusters`.
knn_purity <- function(embeddings, dims, clusters, rna.nn, k = 100) {
  # Search for k + 1 neighbours and drop the first column (treated as the
  # query cell itself), keeping columns 2..(k + 1): exactly k neighbours.
  # Taking 2:k kept only k - 1 neighbours while the count was still divided
  # by k, so a perfectly pure neighbourhood scored (k - 1) / k instead of 1.
  nn <- nn2(data = embeddings[, dims], k = k + 1)$nn.idx[, 2:(k + 1), drop = FALSE]
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = nn))) {
    nn_purity[i] <- sum(clusters[nn[i, ]] == clusters[i]) / k
  }
  return(nn_purity)
}

# Compute KNN purity for each dimension reduction in `emb_list` and stack the
# results into one data frame with columns purity, downsample, method and
# celltype.
#
# emb_list: list of embedding matrices, one per downsampling level
# dims:     dimensions passed on to knn_purity()
# clusters: cluster label for each cell
# rna_nn:   passed on to knn_purity() as `rna.nn`
# method:   method label stored in every row
# ds_list:  downsampling level matching each element of `emb_list`
# k:        number of neighbours passed on to knn_purity()
get_knn_df <- function(emb_list, dims, clusters, rna_nn, method, ds_list, k) {
  per_embedding <- lapply(X = seq_along(along.with = emb_list), FUN = function(idx) {
    data.frame(
      purity = knn_purity(
        embeddings = emb_list[[idx]],
        dims = dims,
        clusters = clusters,
        rna.nn = rna_nn,
        k = k
      ),
      downsample = ds_list[[idx]],
      method = method,
      celltype = clusters
    )
  })
  # fold with rbind, starting from an empty data frame
  Reduce(f = rbind, x = per_embedding, init = data.frame())
}

# KNN purity for every method with k = 100. `dims` skips the dimension found
# above to track sequencing depth (dim 1 for lsi_1, dim 2 for snap).
k <- 100

knn_lsi1 <- get_knn_df(
  emb_list = lsi_1, dims = 2:20, method = "LSI (Signac)",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_lsi2 <- get_knn_df(
  emb_list = lsi_2, dims = 1:20, method = "LSI (Cusanovich2018)",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_lsi3 <- get_knn_df(
  emb_list = lsi_3, dims = 1:20, method = "LSI (log-TF)",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_lsi4 <- get_knn_df(
  emb_list = lsi_4, dims = 1:20, method = "LSI (Cellranger)",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_snap <- get_knn_df(
  emb_list = snap, dims = c(1, 3:20), method = "SnapATAC",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_ct_cgs <- get_knn_df(
  emb_list = ct_cgs, dims = 1:20, method = "cisTopic CGS",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_ct_warp <- get_knn_df(
  emb_list = ct_warp, dims = 1:20, method = "cisTopic Warp",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_scale <- get_knn_df(
  emb_list = scale_pbmc, dims = 1:10, method = "SCALE",
  ds_list = pbmc_ds, clusters = clusters, rna_nn = rna.nn, k = k
)

knn_df <- rbind(knn_lsi1, knn_lsi2, knn_lsi3, knn_lsi4, knn_snap, knn_ct_cgs, knn_ct_warp, knn_scale)
knn_df$downsample <- factor(knn_df$downsample, levels = rev(pbmc_ds))

knn_plot <- knn_df[knn_df$method %in% methods_keep, ]
knn_plot$method <- factor(knn_plot$method, levels = methods_keep)

knn_plot<- knn_plot %>% 
  group_by(celltype, method, downsample) %>% 
  mutate(mn = mean(purity)) %>% 
  ungroup()

knn_plot <- knn_plot[, c("celltype", "method", "downsample", "mn")]
knn_plot <- unique(knn_plot)

p2 <- ggplot(knn_plot, aes(x = downsample, y = mn, fill = method)) +
  geom_boxplot(outlier.size = 0.1) +
  theme_bw() +
  ylab("Mean kNN celltype purity") +
  xlab("Fraction of counts retained") +
  scale_fill_manual(values = colors.use)

# test choice of K
knn_df$k <- k

k <- 150
knn_lsi1 <- get_knn_df(
  emb_list = lsi_1,
  dims = 2:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Signac)",
  k = k
)

knn_lsi2 <- get_knn_df(
  emb_list = lsi_2,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cusanovich2018)",
  k = k
)

knn_lsi3 <- get_knn_df(
  emb_list = lsi_3,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (log-TF)",
  k = k
)

knn_lsi4 <- get_knn_df(
  emb_list = lsi_4,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cellranger)",
  k = k
)

knn_snap <- get_knn_df(
  emb_list = snap,
  dims = c(1, 3:20),
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SnapATAC",
  k = k
)

knn_ct_cgs <- get_knn_df(
  emb_list = ct_cgs,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic CGS",
  k = k
)

knn_ct_warp <- get_knn_df(
  emb_list = ct_warp,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic Warp",
  k = k
)

knn_scale <- get_knn_df(
  emb_list = scale_pbmc,
  dims = 1:10,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SCALE",
  k = k
)

knn_df2 <- rbind(knn_lsi1, knn_lsi2, knn_lsi3, knn_lsi4, knn_snap, knn_ct_cgs, knn_ct_warp, knn_scale)
knn_df2$downsample <- factor(knn_df2$downsample, levels = rev(pbmc_ds))
knn_df2$k <- k

knn_df <- rbind(knn_df, knn_df2)

k <- 50
knn_lsi1 <- get_knn_df(
  emb_list = lsi_1,
  dims = 2:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Signac)",
  k = k
)

knn_lsi2 <- get_knn_df(
  emb_list = lsi_2,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cusanovich2018)",
  k = k
)

knn_lsi3 <- get_knn_df(
  emb_list = lsi_3,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (log-TF)",
  k = k
)

knn_lsi4 <- get_knn_df(
  emb_list = lsi_4,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cellranger)",
  k = k
)

knn_snap <- get_knn_df(
  emb_list = snap,
  dims = c(1, 3:20),
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SnapATAC",
  k = k
)

knn_ct_cgs <- get_knn_df(
  emb_list = ct_cgs,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic CGS",
  k = k
)

knn_ct_warp <- get_knn_df(
  emb_list = ct_warp,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic Warp",
  k = k
)

knn_scale <- get_knn_df(
  emb_list = scale_pbmc,
  dims = 1:10,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SCALE",
  k = k
)

knn_df2 <- rbind(knn_lsi1, knn_lsi2, knn_lsi3, knn_lsi4, knn_snap, knn_ct_cgs, knn_ct_warp, knn_scale)
knn_df2$downsample <- factor(knn_df2$downsample, levels = rev(pbmc_ds))
knn_df2$k <- k

knn_df <- rbind(knn_df, knn_df2)

k <- 10
knn_lsi1 <- get_knn_df(
  emb_list = lsi_1,
  dims = 2:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Signac)",
  k = k
)

knn_lsi2 <- get_knn_df(
  emb_list = lsi_2,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cusanovich2018)",
  k = k
)

knn_lsi3 <- get_knn_df(
  emb_list = lsi_3,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (log-TF)",
  k = k
)

knn_lsi4 <- get_knn_df(
  emb_list = lsi_4,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "LSI (Cellranger)",
  k = k
)

knn_snap <- get_knn_df(
  emb_list = snap,
  dims = c(1, 3:20),
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SnapATAC",
  k = k
)

knn_ct_cgs <- get_knn_df(
  emb_list = ct_cgs,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic CGS",
  k = k
)

knn_ct_warp <- get_knn_df(
  emb_list = ct_warp,
  dims = 1:20,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "cisTopic Warp",
  k = k
)

knn_scale <- get_knn_df(
  emb_list = scale_pbmc,
  dims = 1:10,
  ds_list = pbmc_ds,
  clusters = clusters,
  rna_nn = rna.nn,
  method = "SCALE",
  k = k
)

knn_df2 <- rbind(knn_lsi1, knn_lsi2, knn_lsi3, knn_lsi4, knn_snap, knn_ct_cgs, knn_ct_warp, knn_scale)
knn_df2$downsample <- factor(knn_df2$downsample, levels = rev(pbmc_ds))
knn_df2$k <- k

knn_df <- rbind(knn_df, knn_df2)

knn_df$k <- paste0("k=", as.character(knn_df$k))
knn_df$k <- factor(knn_df$k, levels = c("k=10", "k=50", "k=100", "k=150"))

knn_plot <- knn_df[knn_df$method %in% methods_keep, ]
knn_plot$method <- factor(knn_plot$method, levels = methods_keep)

knn_plot<- knn_plot %>% 
  group_by(celltype, method, downsample, k) %>% 
  mutate(mn = mean(purity)) %>% 
  ungroup()

knn_plot <- knn_plot[, c("celltype", "method", "downsample", "mn", "k")]
knn_plot <- unique(knn_plot)

knn_sensitivity <- ggplot(knn_plot, aes(x = downsample, y = mn, fill = method)) +
  geom_boxplot(outlier.size = 0.1) +
  theme_bw() +
  facet_wrap(~k, ncol = 1) + 
  ylab("Mean kNN celltype purity") +
  xlab("Fraction of counts retained") +
  scale_fill_manual(values = colors.use)

ggsave(filename = "figures/knn_sensitivity.png", plot = knn_sensitivity, height = 8, width = 10, dpi = 300)

###### UMAP ########

# LSI and SCALE embeddings are passed to RunUMAP() as they are, restricted to
# the components used for each method.
umaps_lsi_1 <- lapply(X = lsi_1, FUN = function(emb) RunUMAP(emb[, 2:20]))
umaps_lsi_2 <- lapply(X = lsi_2, FUN = function(emb) RunUMAP(emb[, 1:20]))
umaps_lsi_3 <- lapply(X = lsi_3, FUN = function(emb) RunUMAP(emb[, 1:20]))
umaps_lsi_4 <- lapply(X = lsi_4, FUN = function(emb) RunUMAP(emb[, 1:20]))

# SnapATAC and cisTopic embeddings get their rows labelled with the cell
# names of `pbmc` before UMAP is run on the selected components.
umap_with_cell_names <- function(emb, dims) {
  rownames(emb) <- colnames(pbmc)
  RunUMAP(emb[, dims])
}

umaps_snap <- lapply(X = snap, FUN = umap_with_cell_names, dims = c(1, 3:20))
umaps_ct_cgs <- lapply(X = ct_cgs, FUN = umap_with_cell_names, dims = 1:20)
umaps_ct_warp <- lapply(X = ct_warp, FUN = umap_with_cell_names, dims = 1:20)

umaps_scale <- lapply(X = scale_pbmc, FUN = function(emb) RunUMAP(emb[, 1:10]))

######### Runtimes ##########

# Read a two-column runtime table (seconds, downsampling level) written by one
# of the benchmark scripts and tag every row with the method label.
read_runtime <- function(file, method) {
  tbl <- read.table(file, sep = "\t")
  colnames(tbl) <- c("Seconds", "Downsample")
  tbl$Method <- method
  tbl
}

# The LSI table has a third column holding the TF-IDF method code (1-4), which
# is replaced by the label used in the figures.
runtime_lsi <- read.table("data/pbmc/downsamples/lsi_runtime.txt", sep = "\t")
colnames(runtime_lsi) <- c("Seconds", "Downsample", "Method")
lsi_labels <- c("LSI (Signac)", "LSI (Cusanovich2018)", "LSI (log-TF)", "LSI (Cellranger)")
for (code in seq_along(lsi_labels)) {
  runtime_lsi[runtime_lsi$Method == code, "Method"] <- lsi_labels[code]
}

runtime_snap <- read_runtime("data/pbmc/downsamples/snapatac_runtime.txt", "SnapATAC")
runtime_cistopic_cg <- read_runtime("data/pbmc/downsamples/cistopic_cgs_runtime.txt", "cisTopic CGS")
runtime_cistopic_warp <- read_runtime("data/pbmc/downsamples/cistopic_warp_runtime.txt", "cisTopic Warp")
runtime_scale <- read_runtime("data/pbmc/downsamples/scale_runtime.txt", "SCALE")

# combine, keep only the methods shown in the figure, and fix their order
runtimes <- rbind(runtime_cistopic_cg, runtime_cistopic_warp, runtime_lsi, runtime_snap, runtime_scale)
runtimes <- runtimes[runtimes$Method %in% methods_keep, ]
runtimes$Method <- factor(runtimes$Method, levels = methods_keep)

###### Silhouette ######

# cell-type labels from the `celltype` metadata column (same definition as
# used for the kNN purity above)
clustering.use <- "celltype"
clusters <- pbmc[[clustering.use]][[1]]

# Silhouette width of every cell for each embedding in `embeddings.list`.
#
# embeddings.list: list of embedding matrices, one per downsampling level
# dims:            columns of each embedding used for the distance matrix
# clusters:        cell labels; converted to integer codes for silhouette()
# method:          label stored in the `method` column
# ds:              downsampling level for each element of `embeddings.list`
#
# Returns a data frame with columns celltype, silhouette, method, downsample
# (an empty data frame when `embeddings.list` is empty).
get_silhouette <- function(embeddings.list, dims, clusters, method, ds) {
  # the integer cluster codes are the same for every embedding
  cluster_codes <- as.numeric(x = as.factor(x = clusters))
  per_level <- lapply(X = seq_along(along.with = embeddings.list), FUN = function(idx) {
    pairwise <- dist(x = embeddings.list[[idx]][, dims])
    widths <- silhouette(x = cluster_codes, dist = pairwise)
    data.frame(
      "celltype" = clusters,
      # third column of the silhouette() result is used as the width
      "silhouette" = widths[, 3],
      "method" = method,
      "downsample" = ds[[idx]]
    )
  })
  if (length(x = per_level) == 0) {
    return(data.frame())
  }
  do.call(what = rbind, args = per_level)
}

# silhouette widths for every method across the PBMC downsampling levels
lsi1_sil <- get_silhouette(
  embeddings.list = lsi_1, dims = 2:20, clusters = clusters,
  method = "LSI (Signac)", ds = pbmc_ds
)
lsi2_sil <- get_silhouette(
  embeddings.list = lsi_2, dims = 1:20, clusters = clusters,
  method = "LSI (Cusanovich2018)", ds = pbmc_ds
)
lsi3_sil <- get_silhouette(
  embeddings.list = lsi_3, dims = 1:20, clusters = clusters,
  method = "LSI (log-TF)", ds = pbmc_ds
)
lsi4_sil <- get_silhouette(
  embeddings.list = lsi_4, dims = 1:20, clusters = clusters,
  method = "LSI (Cellranger)", ds = pbmc_ds
)
snap_sil <- get_silhouette(
  embeddings.list = snap, dims = c(1, 3:20), clusters = clusters,
  method = "SnapATAC", ds = pbmc_ds
)
cgs_sil <- get_silhouette(
  embeddings.list = ct_cgs, dims = 1:20, clusters = clusters,
  method = "cisTopic CGS", ds = pbmc_ds
)
warp_sil <- get_silhouette(
  embeddings.list = ct_warp, dims = 1:20, clusters = clusters,
  method = "cisTopic Warp", ds = pbmc_ds
)
scale_sil <- get_silhouette(
  embeddings.list = scale_pbmc, dims = 1:10, clusters = clusters,
  method = "SCALE", ds = pbmc_ds
)

sil_pbmc <- rbind(
  lsi1_sil, lsi2_sil, lsi3_sil, lsi4_sil,
  snap_sil, cgs_sil, warp_sil, scale_sil
)
sil_pbmc$downsample <- factor(sil_pbmc$downsample)

# keep the methods shown in the figure, in their plotting order
sil_pbmc_plot <- sil_pbmc[sil_pbmc$method %in% methods_keep, ]
sil_pbmc_plot$method <- factor(sil_pbmc_plot$method, levels = methods_keep)

# one mean silhouette per cell type / method / downsampling level
sil_pbmc_plot <- ungroup(
  mutate(
    group_by(sil_pbmc_plot, celltype, method, downsample),
    mn = mean(silhouette)
  )
)
sil_pbmc_plot <- unique(sil_pbmc_plot[, c("celltype", "method", "downsample", "mn")])

sil_plot <- ggplot(sil_pbmc_plot, aes(x = downsample, y = mn, fill = method)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_fill_manual(values = colors.use) +
  xlab("Fraction of counts retained") +
  ylab("Mean Silhouette") +
  theme_bw()

###### Figure ######

# Plot a dimension reduction coloured by the `celltype` metadata column.
#
# dr:     reduction to plot; stored in `object` under the name "dr"
# object: object the reduction is attached to (only the local copy is changed)
#
# Returns the plot with empty title and axis labels and a single-column
# legend. The plot is now returned directly rather than through a trailing
# assignment, which made the result invisible and left an unused local.
create_plot <- function(dr, object) {
  object[["dr"]] <- dr
  DimPlot(object, reduction = "dr", group.by = "celltype", pt.size = 0.1) +
    ggtitle("") + ylab("") + xlab("") +
    guides(color = guide_legend(ncol = 1, override.aes = list(size = 2)))
}

# umaps
# show the first and fifth downsampling levels (labelled "Full dataset" and
# "20% counts" below)
umap.use <- c(1, 5)
lsi_plot <- lapply(umaps_lsi_1[umap.use], create_plot, object = pbmc)
lsi2_plot <- lapply(umaps_lsi_2[umap.use], create_plot, object = pbmc)
scale_plot <- lapply(umaps_scale[umap.use], create_plot, object = pbmc)
snap_plot <- lapply(umaps_snap[umap.use], create_plot, object = pbmc)

lsi_plot[[1]] <- lsi_plot[[1]] + ylab("Full dataset") + ggtitle("LSI (Signac)")
lsi_plot[[2]] <- lsi_plot[[2]] + ylab("20% counts")
scale_plot[[1]] <- scale_plot[[1]] + ggtitle("SCALE")
snap_plot[[1]] <- snap_plot[[1]] + ggtitle("SnapATAC")

umaps <- wrap_plots(
  list(lsi_plot[[1]], scale_plot[[1]], snap_plot[[1]],
    lsi_plot[[2]], scale_plot[[2]], snap_plot[[2]]),
  ncol = 3,
  guides = "collect"
)

bs <- 16

p3 <- ggplot(runtimes[runtimes$Downsample == 1, ], aes(y = Seconds/60, x = Method, fill = Method)) +
  geom_bar(stat = "identity") +
  scale_y_log10() +
  ggtitle("Total run time") +
  theme_bw(base_size = bs) +
  scale_fill_manual(values = colors.use) +
  ylab("Time (minutes)") +
  xlab("") +
  theme(legend.position = 'none', axis.text.x = element_text(size = 8, angle = 25, vjust = 1, hjust=1))

# theme_bw() is a complete theme and resets legend.position, so it has to come
# before the legend is switched off (the previous order undid the setting)
sil_plot <- sil_plot + theme_bw(base_size = bs) + theme(legend.position = "none")
umaps <- umaps & theme_bw(base_size = bs)
p2 <- p2 + theme_bw(base_size = bs)

fig <- (umaps | p3) + plot_layout(widths = c(3, 1))
metrics <- (p2 / sil_plot) + plot_layout(guides = "collect")
pbmc_fig <- (fig / metrics) & theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))
# Pass the plot explicitly. `pbmc_fig + ggsave(...)` saved whatever
# last_plot() returned and no longer works with ggplot2 >= 3.3.4.
ggsave(filename = "figures/dimreduc_pbmc.png", plot = pbmc_fig, height = 12, width = 16)

######## Chen #########

# coverage levels of the simulated bone marrow datasets
chen_levels <- c(250, 500, 1000, 2500, 5000)

# Read the saved LSI embeddings of one TF-IDF method for every coverage level.
#
# method: TF-IDF method code used in the file name
# path:   directory (with trailing slash) holding the lsi_<level>_<method>.rds
#         files
# levels: coverage levels to read; defaults to `chen_levels`
#
# Returns a list with one embedding per element of `levels`, in that order.
read_lsi <- function(method, path ="data/chen/embeddings/", levels = chen_levels) {
  methodstr <- paste0(as.character(method), ".rds")
  # use the `levels` argument; it was previously ignored in favour of the
  # global `chen_levels`
  lsi <- paste0(path, "lsi_", levels, "_", methodstr)
  lsi_obj <- lapply(X = lsi, FUN = readRDS)
  return(lsi_obj)
}

# LSI embeddings for the four TF-IDF methods, one list element per level
lsi_chen_1 <- read_lsi(method = 1)
lsi_chen_2 <- read_lsi(method = 2)
lsi_chen_3 <- read_lsi(method = 3)
lsi_chen_4 <- read_lsi(method = 4)

# SnapATAC and cisTopic embeddings saved by the benchmark scripts
snap_chen_files <- paste0("data/chen/embeddings/snapatac_", chen_levels, ".rds")
cgs_chen_files <- paste0("data/chen/embeddings/cistopic_cgs_", chen_levels, ".rds")
warp_chen_files <- paste0("data/chen/embeddings/cistopic_warp_", chen_levels, ".rds")

snap_chen <- lapply(snap_chen_files, readRDS)
# the cisTopic matrices are transposed after loading
ct_cgs_chen <- lapply(lapply(cgs_chen_files, readRDS), t)
ct_warp_chen <- lapply(lapply(warp_chen_files, readRDS), t)

# convert h5ad to h5seurat
chen_scale_paths <- lapply(chen_levels, function(level) {
  h5ad_file <- paste0("data/chen/embeddings/scale_", level, "/adata.h5ad")
  h5seurat_file <- paste0("data/chen/embeddings/scale_", level, "/adata.h5seurat")
  Convert(source = h5ad_file, dest = h5seurat_file, overwrite = TRUE)
})

# load h5seurat
scale_chen_obj <- lapply(chen_scale_paths, LoadH5Seurat)

# extract embeddings from the "latent" reduction
scale_chen <- lapply(scale_chen_obj, function(obj) Embeddings(obj[["latent"]]))

# object built from the 5000-coverage count matrix; `celltype` is copied from
# orig.ident
counts <- readRDS("data/chen/scATAC-benchmarking-master/Synthetic_Data/BoneMarrow_cov5000/input/bonemarrow_cov5000.rds")
chen_obj <- CreateSeuratObject(counts = counts)
chen_obj$celltype <- chen_obj$orig.ident

######## Determine dimensions to use ##########

seqdepth_chen <- chen_obj$nCount_RNA

# Indices of the components of one embedding whose absolute correlation with
# the per-cell total counts exceeds 0.9.
depth_correlated_dims <- function(emb) {
  depth_cor <- cor(emb, seqdepth_chen)
  which(abs(depth_cor) > 0.9)
}

# one result per coverage level for every method
lsi1_depth <- lapply(lsi_chen_1, depth_correlated_dims)
lsi2_depth <- lapply(lsi_chen_2, depth_correlated_dims)
lsi3_depth <- lapply(lsi_chen_3, depth_correlated_dims)
lsi4_depth <- lapply(lsi_chen_4, depth_correlated_dims)
cgs_depth <- lapply(ct_cgs_chen, depth_correlated_dims)
warp_depth <- lapply(ct_warp_chen, depth_correlated_dims)
snap_depth <- lapply(snap_chen, depth_correlated_dims)
scale_depth <- lapply(scale_chen, depth_correlated_dims)

##### neighbors #####

# Compute kNN purity for each embedding in `emb_list` and stack the results.
#
# This redefinition replaces the earlier get_knn_df(). It used to drop the
# `k` parameter, so calls written for the earlier definition failed once this
# one was in scope. `k` is accepted again and defaults to 100, the default
# knn_purity() applied when no k was passed.
#
# emb_list: list of embedding matrices, one per coverage level
# dims:     columns of each embedding passed to knn_purity()
# clusters: cell labels, stored in the `celltype` column
# rna_nn:   forwarded to knn_purity() as `rna.nn`
# method:   label stored in the `method` column
# ds_list:  coverage level for each element of `emb_list`
# k:        number of neighbours forwarded to knn_purity()
#
# Returns a data frame with columns purity, downsample, method, celltype
# (an empty data frame when `emb_list` is empty).
get_knn_df <- function(emb_list, dims, clusters, rna_nn, method, ds_list, k = 100) {
  per_level <- lapply(X = seq_along(along.with = emb_list), FUN = function(i) {
    knn <- knn_purity(
      embeddings = emb_list[[i]],
      dims = dims,
      clusters = clusters,
      rna.nn = rna_nn,
      k = k
    )
    data.frame(purity = knn, downsample = ds_list[[i]], method = method, celltype = clusters)
  })
  if (length(x = per_level) == 0) {
    return(data.frame())
  }
  # bind once instead of growing the data frame inside a loop
  do.call(what = rbind, args = per_level)
}

# cell-type label = text before the first "_" in each cell name
clusters <- unlist(lapply(strsplit(x = rownames(lsi_chen_1[[1]]), split = "_"), FUN = `[[`, 1))

# One entry per method: list of embeddings (one per coverage level), the
# components to use, and the label shown in the figures. This replaces eight
# copy-pasted get_knn_df() calls; the order matches the previous rbind() order.
# The per-method data frames are no longer kept as globals.
chen_knn_methods <- list(
  list(emb = lsi_chen_1, dims = 1:5, method = "LSI (Signac)"),
  list(emb = lsi_chen_2, dims = 1:5, method = "LSI (Cusanovich2018)"),
  list(emb = lsi_chen_3, dims = 1:5, method = "LSI (log-TF)"),
  list(emb = lsi_chen_4, dims = 1:5, method = "LSI (Cellranger)"),
  list(emb = ct_cgs_chen, dims = 1:5, method = "cisTopic CGS"),
  list(emb = ct_warp_chen, dims = 1:5, method = "cisTopic Warp"),
  list(emb = snap_chen, dims = 1:5, method = "SnapATAC"),
  list(emb = scale_chen, dims = 1:10, method = "SCALE")
)

knn_df_chen <- do.call(
  what = rbind,
  args = lapply(X = chen_knn_methods, FUN = function(m) {
    get_knn_df(
      emb_list = m$emb,
      dims = m$dims,
      ds_list = chen_levels,
      clusters = clusters,
      rna_nn = NULL,
      method = m$method
    )
  })
)

knn_df_chen_plot <- knn_df_chen[knn_df_chen$method %in% methods_keep, ]
knn_df_chen_plot$method <- factor(knn_df_chen_plot$method, levels =  methods_keep)

# one mean purity value per coverage level / method / cell type
knn_df_chen_plot <- knn_df_chen_plot %>%
  group_by(downsample, method, celltype) %>%
  mutate(mn = mean(purity)) %>%
  ungroup()

# Keep `celltype` when de-duplicating. Without it, two cell types with the
# same mean purity for a method and coverage level were merged into one row
# by unique() and dropped from the box plot.
knn_df_chen_plot <- knn_df_chen_plot[, c("celltype", "mn", "method", "downsample")]
knn_df_chen_plot <- unique(knn_df_chen_plot)

# compute UMAP for each
# LSI and SCALE embeddings are passed to RunUMAP() as they are
umaps_lsi_1_chen <- lapply(X = lsi_chen_1, FUN = function(emb) RunUMAP(emb[, 1:5]))
umaps_lsi_2_chen <- lapply(X = lsi_chen_2, FUN = function(emb) RunUMAP(emb[, 1:5]))
umaps_lsi_3_chen <- lapply(X = lsi_chen_3, FUN = function(emb) RunUMAP(emb[, 1:5]))
umaps_lsi_4_chen <- lapply(X = lsi_chen_4, FUN = function(emb) RunUMAP(emb[, 1:5]))

# SnapATAC and cisTopic embeddings get their rows labelled with the cell
# names of `chen_obj` before UMAP is run on the first five components
umap_with_chen_names <- function(emb) {
  rownames(emb) <- colnames(chen_obj)
  RunUMAP(emb[, 1:5])
}

umaps_snap_chen <- lapply(X = snap_chen, FUN = umap_with_chen_names)
umaps_ct_cgs_chen <- lapply(X = ct_cgs_chen, FUN = umap_with_chen_names)
umaps_ct_warp_chen <- lapply(X = ct_warp_chen, FUN = umap_with_chen_names)

umaps_scale_chen <- lapply(X = scale_chen, FUN = function(emb) RunUMAP(emb[, 1:10]))

## Silhouette ##

# silhouette widths for every method across the coverage levels
lsi1_sil_chen <- get_silhouette(lsi_chen_1, 1:5, clusters, "LSI (Signac)", chen_levels)
lsi2_sil_chen <- get_silhouette(lsi_chen_2, 1:5, clusters, "LSI (Cusanovich2018)", chen_levels)
lsi3_sil_chen <- get_silhouette(lsi_chen_3, 1:5, clusters, "LSI (log-TF)", chen_levels)
lsi4_sil_chen <- get_silhouette(lsi_chen_4, 1:5, clusters, "LSI (Cellranger)", chen_levels)
snap_sil_chen <- get_silhouette(snap_chen, 1:5, clusters, "SnapATAC", chen_levels)
cgs_sil_chen <- get_silhouette(ct_cgs_chen, 1:5, clusters, "cisTopic CGS", chen_levels)
warp_sil_chen <- get_silhouette(ct_warp_chen, 1:5, clusters, "cisTopic Warp", chen_levels)
scale_sil_chen <- get_silhouette(scale_chen, 1:10, clusters, "SCALE", chen_levels)

sil_chen <- rbind(lsi1_sil_chen, lsi2_sil_chen, lsi3_sil_chen, lsi4_sil_chen,
                  snap_sil_chen, cgs_sil_chen, warp_sil_chen, scale_sil_chen)

sil_chen$downsample <- factor(sil_chen$downsample)

sil_chen_plot <- sil_chen[sil_chen$method %in% methods_keep, ]
sil_chen_plot$method <- factor(sil_chen_plot$method, levels =  methods_keep)

# one mean silhouette per coverage level / method / cell type
sil_chen_plot <- sil_chen_plot %>%
  group_by(downsample, method, celltype) %>%
  mutate(mn = mean(silhouette)) %>%
  ungroup()
# Keep `celltype` when de-duplicating so that two cell types with the same
# mean are not merged into a single row by unique().
sil_chen_plot <- sil_chen_plot[, c("celltype", "downsample", "method", "mn")]
sil_chen_plot <- unique(sil_chen_plot)

# note: `sil_chen` is reused here and now holds the plot, not the data frame
sil_chen <- ggplot(sil_chen_plot, aes(x = downsample, y = mn, fill = method)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_fill_manual(values = colors.use) +
  xlab("Average counts per cell") +
  ylab("Mean Silhouette") +
  theme_bw()

######### Runtimes ##########

# Read a two-column runtime table (seconds, coverage level) written by one of
# the benchmark scripts and tag every row with the method label.
read_runtime <- function(file, method) {
  tbl <- read.table(file, sep = "\t")
  colnames(tbl) <- c("Seconds", "Downsample")
  tbl$Method <- method
  tbl
}

# The LSI table has a third column holding the TF-IDF method code (1-4), which
# is replaced by the label used in the figures.
runtime_lsi <- read.table("data/chen/embeddings/lsi_runtime.txt", sep = "\t")
colnames(runtime_lsi) <- c("Seconds", "Downsample", "Method")
lsi_labels <- c("LSI (Signac)", "LSI (Cusanovich2018)", "LSI (log-TF)", "LSI (Cellranger)")
for (code in seq_along(lsi_labels)) {
  runtime_lsi[runtime_lsi$Method == code, "Method"] <- lsi_labels[code]
}

runtime_snap <- read_runtime("data/chen/embeddings/snapatac_runtime.txt", "SnapATAC")
runtime_cistopic_cg <- read_runtime("data/chen/embeddings/cistopic_cgs_runtime.txt", "cisTopic CGS")
runtime_cistopic_warp <- read_runtime("data/chen/embeddings/cistopic_warp_runtime.txt", "cisTopic Warp")
runtime_scale <- read_runtime("data/chen/embeddings/scale_runtime.txt", "SCALE")

# combine, keep only the methods shown in the figure, and fix their order
runtimes <- rbind(runtime_cistopic_cg, runtime_cistopic_warp, runtime_lsi, runtime_snap, runtime_scale)
runtimes <- runtimes[runtimes$Method %in% methods_keep, ]
runtimes$Method <- factor(runtimes$Method, levels = methods_keep)

# bar chart of the runtimes recorded for the 5000 level, on a log scale
runtimes_5000 <- runtimes[runtimes$Downsample == 5000, ]
chen_runtimes <- ggplot(runtimes_5000, aes(y = Seconds, x = Method, fill = Method)) +
  geom_bar(stat = "identity") +
  scale_y_log10() +
  theme_bw() +
  scale_fill_manual(values = colors.use) +
  ylab("Time (seconds)") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

#### Figure #####

# supplementary figure for the chen dataset: a grid of UMAPs (one row per
# method, one column per coverage level) above the kNN purity and silhouette
# box plots

lsi_plot <- lapply(X = umaps_lsi_1_chen, FUN = create_plot, object = chen_obj)
lsi2_plot <- lapply(X = umaps_lsi_2_chen, FUN = create_plot, object = chen_obj)
lsi3_plot <- lapply(X = umaps_lsi_3_chen, FUN = create_plot, object = chen_obj)
scale_plot <- lapply(X = umaps_scale_chen, FUN = create_plot, object = chen_obj)
cgs_plot <- lapply(X = umaps_ct_cgs_chen, FUN = create_plot, object = chen_obj)
warp_plot <- lapply(X = umaps_ct_warp_chen, FUN = create_plot, object = chen_obj)
snap_plot <- lapply(X = umaps_snap_chen, FUN = create_plot, object = chen_obj)

# set axis grid names
# top row: the coverage level is the column title; the first plot of the row
# also carries the method name as its y label
column_titles <- c("250", "500", "1000", "2500", "5000")
lsi_plot[[1]] <- lsi_plot[[1]] + ylab("LSI (Signac)") + ggtitle(column_titles[1])
for (col in 2:5) {
  lsi_plot[[col]] <- lsi_plot[[col]] + ggtitle(column_titles[col])
}

# remaining rows: method name on the first plot of each row
lsi2_plot[[1]] <- lsi2_plot[[1]] + ylab("LSI (Cusanovich2018)")
lsi3_plot[[1]] <- lsi3_plot[[1]] + ylab("LSI (log-TF)")
scale_plot[[1]] <- scale_plot[[1]] + ylab("SCALE")
cgs_plot[[1]] <- cgs_plot[[1]] + ylab("cisTopic CGS")
warp_plot[[1]] <- warp_plot[[1]] + ylab("cisTopic Warp")
snap_plot[[1]] <- snap_plot[[1]] + ylab("SnapATAC")

umap_grid <- c(lsi_plot, lsi2_plot, lsi3_plot, scale_plot, cgs_plot, warp_plot, snap_plot)
p1 <- wrap_plots(umap_grid, nrow = 7, guides = "collect")
p1 <- p1 & theme(plot.margin =  unit(c(1,1,1,1), "mm"))

p2 <- ggplot(knn_df_chen_plot, aes(x = as.factor(downsample), y = mn, fill = method)) +
  geom_boxplot(outlier.size = 0.1) +
  theme_bw() +
  theme(legend.position = "none") +
  scale_fill_manual(values = colors.use) +
  ylab("Mean kNN celltype purity") +
  xlab("Average counts per cell")

metrics <- (p2 / sil_chen) + plot_layout(guides = "collect")

pp <- (p1 / metrics) + plot_layout(heights = c(3, 1)) & theme(axis.text = element_text(size=8))
ggsave(filename = "figures/dimreduc_chen.png", plot = pp, width = 10, height = 16, units = "in", dpi = 400)

R ggplot2 dplyr Seurat patchwork Signac SeuratDisk cluster paletteer RANN From line 1 of pbmc_downsampling/evaluate_dimreducs.R

library(Signac)
library(cisTopic)

# Fit the cisTopic CGS and WarpLDA models on one count matrix, save the
# selected models' `document_expects` matrices and append the model-fitting
# times to the runtime files in `out_dir`.
#
# counts_use:   count matrix passed to createcisTopicObject()
# d:            dataset / downsampling label used in file names and logs
# project_name: project name given to the cisTopic object
# out_dir:      output directory, with trailing slash
#
# Only runCGSModels() / runWarpLDAModels() are timed; model selection is not.
# Runtime lines are appended, so re-running adds further rows to the files.
run_cistopic_benchmark <- function(counts_use, d, project_name, out_dir) {
  cisTopicObj <- createcisTopicObject(count.matrix = counts_use, project.name = project_name)

  # CGS model
  time.start <- Sys.time()
  cgs <- runCGSModels(object = cisTopicObj)
  # spell out `units`; the previous `unit =` relied on partial matching
  elapsed.cgs <- as.numeric(difftime(Sys.time(), time.start, units = "secs"))
  cgs <- selectModel(object = cgs, type = "maximum")

  # WarpLDA model
  time.start <- Sys.time()
  wrp <- runWarpLDAModels(object = cisTopicObj)
  elapsed.warp <- as.numeric(difftime(Sys.time(), time.start, units = "secs"))
  wrp <- selectModel(object = wrp, type = "derivative")

  # extract coordinates
  dimreduc_cgs <- cgs@selected.model$document_expects
  dimreduc_wrp <- wrp@selected.model$document_expects

  # save
  saveRDS(object = dimreduc_cgs, file = paste0(out_dir, "cistopic_cgs_", d, ".rds"))
  saveRDS(object = dimreduc_wrp, file = paste0(out_dir, "cistopic_warp_", d, ".rds"))

  write(
    x = paste0(elapsed.cgs, "\t", d),
    file = paste0(out_dir, "cistopic_cgs_runtime.txt"),
    append = TRUE
  )
  write(
    x = paste0(elapsed.warp, "\t", d),
    file = paste0(out_dir, "cistopic_warp_runtime.txt"),
    append = TRUE
  )
  invisible(NULL)
}

# PBMC dataset
ds_level <- rev(seq(0.2, 1, 0.2))
for (d in ds_level) {
  counts_use <- readRDS(file = paste0("data/pbmc/downsamples/", d, ".rds"))
  # rewrite peak names with ":" and "-" separators
  rownames(counts_use) <- GRangesToString(StringToGRanges(rownames(counts_use)), sep = c(":", "-"))
  run_cistopic_benchmark(
    counts_use = counts_use,
    d = d,
    project_name = 'PBMC',
    out_dir = "data/pbmc/downsamples/"
  )
}

# Chen dataset
bm_datasets <- c("250", "500", "1000", "2500", "5000")
filepath <- "data/chen/scATAC-benchmarking-master/Synthetic_Data/BoneMarrow_cov"
for (d in bm_datasets) {
  counts_use <- readRDS(file = paste0(filepath, d, "/input/bonemarrow_cov", d, ".rds"))
  # peak names here use "_" separators; convert to ":" and "-"
  rownames(counts_use) <- GRangesToString(StringToGRanges(rownames(counts_use), sep = c("_", "_")), sep = c(":", "-"))
  run_cistopic_benchmark(
    counts_use = counts_use,
    d = d,
    project_name = 'Simulated',
    out_dir = "data/chen/embeddings/"
  )
}

R Signac From line 1 of pbmc_downsampling/run_cistopic.R

library(Signac)
library(Seurat)

pbmc <- readRDS("objects/pbmc.rds")
atac.assay <- "ATAC"
method_use <- c(1, 2, 3, 4)
DefaultAssay(pbmc) <- atac.assay
obj <- pbmc

# Run TF-IDF + SVD on `obj` once per method in `method_use`, saving the "lsi"
# embeddings and appending the SVD time to lsi_runtime.txt in `out_dir`.
#
# obj:     object whose `atac.assay` counts are analysed
# d:       dataset / downsampling label used in file names and logs
# out_dir: output directory, with trailing slash
#
# Only RunSVD() is timed; RunTFIDF() is not included. Returns the object as
# left by the last method. Runtime lines are appended on every run.
run_lsi_methods <- function(obj, d, out_dir) {
  for (m in method_use) {
    key <- paste(d, m, sep = "_")
    message(key)
    obj <- RunTFIDF(object = obj, assay = atac.assay, method = m)
    time.start <- Sys.time()
    obj <- RunSVD(obj, features = rownames(x = obj))
    # spell out `units`; the previous `unit =` relied on partial matching
    elapsed <- as.numeric(difftime(Sys.time(), time.start, units = "secs"))
    emb <- Embeddings(object = obj, reduction = "lsi")
    saveRDS(object = emb, file = paste0(out_dir, "lsi_", key, ".rds"))
    write(
      x = paste0(elapsed, "\t", d, "\t", m),
      file = paste0(out_dir, "lsi_runtime.txt"),
      append = TRUE
    )
  }
  obj
}

# pbmc multiome
ds_level <- rev(seq(0.2, 1, 0.2))
for (d in ds_level) {
  counts_use <- readRDS(file = paste0("data/pbmc/downsamples/", d, ".rds"))
  # swap the downsampled counts into the existing object
  obj <- SetAssayData(obj, slot = "counts", assay = atac.assay, new.data = counts_use)
  obj <- run_lsi_methods(obj = obj, d = d, out_dir = "data/pbmc/downsamples/")
}

# simulated bone marrow
bm_datasets <- c("250", "500", "1000", "2500", "5000")
filepath <- "data/chen/scATAC-benchmarking-master/Synthetic_Data/BoneMarrow_cov"
for (d in bm_datasets) {
  counts_use <- readRDS(file = paste0(filepath, d, "/input/bonemarrow_cov", d, ".rds"))
  obj <- CreateSeuratObject(counts = counts_use, min.cells = -1, min.features = -1, assay = "ATAC")
  obj <- run_lsi_methods(obj = obj, d = d, out_dir = "data/chen/embeddings/")
}

R Seurat Signac From line 1 of pbmc_downsampling/run_lsi.R

library(Signac)
library(Seurat)
library(DropletUtils)

# fixed seed, set once before the downsampling loop
set.seed(seed = 1234)

atac.assay <- "ATAC"
pbmc <- readRDS(file = "objects/pbmc.rds")
counts <- GetAssayData(object = pbmc, slot = "counts", assay = atac.assay)

# downsample counts
# proportions 1, 0.8, ..., 0.2, processed in that order
ds_level <- rev(x = seq(from = 0.2, to = 1, by = 0.2))

for (d in ds_level) {
  out_file <- paste0("data/pbmc/downsamples/", d, ".rds")
  counts_use <- downsampleMatrix(x = counts, prop = d)
  saveRDS(object = counts_use, file = out_file)
}

R Seurat Signac DropletUtils From line 1 of pbmc_downsampling/run_pbmc_downsample.R

library(Matrix)

# Write one count matrix in the layout read by SCALE.py (counts.mtx,
# peaks.txt, barcodes.txt inside `work_dir`), run SCALE.py on it and append
# the elapsed time to scale_runtime.txt in `out_dir`.
#
# counts_use: sparse count matrix; its column names are written as barcodes
# peaks:      peak names written to peaks.txt
# d:          dataset / downsampling label used in the output directory name
# work_dir:   directory holding the SCALE input files (no trailing slash)
# out_dir:    parent directory for outputs, with trailing slash
# extra_args: extra command-line text placed after the input directory
#
# Returns the exit status of SCALE.py invisibly. A non-zero status raises a
# warning and the runtime is not recorded; previously the status was ignored
# and failed runs were logged like successful ones.
run_scale <- function(counts_use, peaks, d, work_dir, out_dir, extra_args = "") {
  writeMM(obj = counts_use, file = file.path(work_dir, "counts.mtx"))
  write.table(
    x = peaks,
    file = file.path(work_dir, "peaks.txt"),
    append = FALSE,
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  )
  write.table(
    x = colnames(counts_use),
    file = file.path(work_dir, "barcodes.txt"),
    append = FALSE,
    row.names = FALSE,
    col.names = FALSE,
    quote = FALSE
  )
  time.start <- Sys.time()
  cmd <- paste0("SCALE.py -d ", work_dir, extra_args, " -o ", out_dir, "scale_", d)
  status <- system(command = cmd, wait = TRUE, ignore.stderr = FALSE, ignore.stdout = FALSE)
  # spell out `units`; the previous `unit =` relied on partial matching
  elapsed <- as.numeric(difftime(Sys.time(), time.start, units = "secs"))
  if (status != 0) {
    warning(
      "SCALE.py exited with status ", status, " for '", cmd,
      "'; runtime not recorded",
      call. = FALSE
    )
    return(invisible(status))
  }
  write(
    x = paste0(elapsed, "\t", d),
    file = paste0(out_dir, "scale_runtime.txt"),
    append = TRUE
  )
  invisible(status)
}

dir.create(file.path("data/pbmc/downsamples/pbmc_scale"), showWarnings = FALSE, recursive = TRUE)
# PBMC dataset
ds_level <- rev(seq(0.2, 1, 0.2))
for (d in ds_level) {
  counts_use <- readRDS(file = paste0("data/pbmc/downsamples/", d, ".rds"))
  # peak names are written with "_" in place of "-"
  peaks <- gsub("-", "_", rownames(counts_use))
  run_scale(
    counts_use = counts_use,
    peaks = peaks,
    d = d,
    work_dir = "data/pbmc/downsamples/pbmc_scale",
    out_dir = "data/pbmc/downsamples/",
    extra_args = " --min_peaks 1"
  )
}

dir.create(file.path("data/chen/embeddings/scale"), showWarnings = FALSE, recursive = TRUE)
# Chen dataset
bm_datasets <- c("250", "500", "1000", "2500", "5000")
filepath <- "data/chen/scATAC-benchmarking-master/Synthetic_Data/BoneMarrow_cov"
for (d in bm_datasets) {
  counts_use <- readRDS(file = paste0(filepath, d, "/input/bonemarrow_cov", d, ".rds"))
  # peak names are written unchanged for this dataset
  run_scale(
    counts_use = counts_use,
    peaks = rownames(counts_use),
    d = d,
    work_dir = "data/chen/embeddings/scale",
    out_dir = "data/chen/embeddings/"
  )
}

R Matrix From line 1 of pbmc_downsampling/run_scale.R

library(Signac)
library(SnapATAC)

# PBMC dataset
# Time SnapATAC diffusion maps on each downsampled PBMC matrix
# (levels 1, 0.8, 0.6, 0.4, 0.2) and save the resulting embedding.
ds_level <- rev(seq(0.2, 1, 0.2))
for (d in ds_level) {
  counts_use <- readRDS(file = paste0("data/pbmc/downsamples/", d, ".rds"))
  # transpose so that row names supply the barcodes and column names the bins
  counts_use <- t(x = counts_use)
  snap <- createSnapFromBmat(
    mat = counts_use,
    barcodes = rownames(x = counts_use),
    bins = StringToGRanges(regions = colnames(x = counts_use))
  )
  snap <- makeBinary(snap, mat = "bmat")
  # only the diffusion map step is timed; object construction and
  # binarization above are excluded
  time.start <- Sys.time()
  snap <- runDiffusionMaps(
    obj = snap,
    input.mat = "bmat",
    num.eigs = 50
  )
  # `units` spelled out in full rather than relying on partial matching
  elapsed <- as.numeric(Sys.time() - time.start, units = "secs")
  reducedMatrix <- snap@smat@dmat
  saveRDS(object = reducedMatrix, file = paste0("data/pbmc/downsamples/snapatac_", d, ".rds"))
  # one tab-separated line per run: <seconds>\t<downsampling level>
  write(
    x = paste0(elapsed, "\t", d),
    file = "data/pbmc/downsamples/snapatac_runtime.txt",
    append = TRUE
  )
}

# Chen dataset
# Time SnapATAC diffusion maps on each synthetic bone marrow matrix and save
# the resulting embedding.
bm_datasets <- c("250", "500", "1000", "2500", "5000")
filepath <- "data/chen/scATAC-benchmarking-master/Synthetic_Data/BoneMarrow_cov"
for (d in bm_datasets) {
  counts_use <- readRDS(file = paste0(filepath, d, "/input/bonemarrow_cov", d, ".rds"))
  # transpose so that row names supply the barcodes and column names the bins
  counts_use <- t(x = counts_use)
  snap <- createSnapFromBmat(
    mat = counts_use,
    barcodes = rownames(x = counts_use),
    # region names in this dataset are parsed with "_" as both separators
    bins = StringToGRanges(regions = colnames(x = counts_use), sep = c("_", "_"))
  )
  snap <- makeBinary(snap, mat = "bmat")
  # only the diffusion map step is timed; object construction and
  # binarization above are excluded
  time.start <- Sys.time()
  snap <- runDiffusionMaps(
    obj = snap,
    input.mat = "bmat",
    num.eigs = 50
  )
  # `units` spelled out in full rather than relying on partial matching
  elapsed <- as.numeric(Sys.time() - time.start, units = "secs")
  reducedMatrix <- snap@smat@dmat
  saveRDS(object = reducedMatrix, file = paste0("data/chen/embeddings/snapatac_", d, ".rds"))
  # one tab-separated line per run: <seconds>\t<dataset>
  write(
    x = paste0(elapsed, "\t", d),
    file = "data/chen/embeddings/snapatac_runtime.txt",
    append = TRUE
  )
}

R Signac From line 1 of pbmc_downsampling/run_snapatac.R

library(Seurat)
library(Signac)
library(ggplot2)
library(patchwork)
library(GenomicRanges)


# Restrict the processed PBMC object to a single identity class; all
# downsampling below draws from these cells only.
pbmc <- readRDS("objects/pbmc.rds")
ident.use <- "CD14 Mono"

obj <- pbmc[, Idents(pbmc) == ident.use]

# Sample sizes: 50, 150, ..., 2850 cells
ds.level <- seq(from = 50, to = 2850, by = 100)
pk.list <- list()
cp.list <- list()
for (i in seq_along(ds.level)) {
  n.cells <- ds.level[[i]]
  key <- as.character(i)

  # the seed is reset before every draw, so each sample size is reproducible
  # independently of the others
  set.seed(1234)
  cells.use <- sample(x = colnames(obj), size = n.cells, replace = FALSE)
  obj.ds <- obj[, cells.use]
  # label used to group the coverage track for this sample size
  obj.ds$ds <- paste0(n.cells, " cells")

  peaks.ds <- CallPeaks(
    object = obj.ds,
    group.by = "celltype",
    additional.args = "--max-gap 50"
  )
  pk.list[[key]] <- peaks.ds

  # coverage around LYZ with the peaks called at this sample size drawn as
  # the ranges track; ymax is fixed so panels share a common scale
  cp.list[[key]] <- CoveragePlot(
    object = obj.ds,
    region = "LYZ",
    group.by = "ds",
    extend.upstream = 6000,
    extend.downstream = 8000,
    peaks = FALSE,
    ranges = peaks.ds,
    ymax = 260
  )
}

# find overlaps with highest sampling
# The last entry of pk.list holds the peaks called on the largest sample and
# is used as the reference set.
pk.highest <- pk.list[[length(pk.list)]]

# For each downsampled peak set, compute the fraction of reference peaks that
# are overlapped by at least one of its peaks. Counting reference peaks with
# >= 1 hit (instead of summing overlap counts) keeps the value a true
# fraction: a reference peak covered by several fragmented peaks is counted
# once, so the result cannot exceed 1.
olap <- vapply(
  X = pk.list,
  FUN = function(pk) {
    recovered <- countOverlaps(query = pk.highest, subject = pk) > 0
    sum(recovered) / length(pk.highest)
  },
  FUN.VALUE = numeric(1),
  USE.NAMES = FALSE
)

# Recovery curve: one point per sample size, with a smoothed trend line
df <- data.frame(x = unlist(olap), cells = ds.level)
p <- ggplot(data = df, mapping = aes(x = cells, y = x))
p <- p + geom_point() + geom_smooth(se = FALSE)
p <- p + xlab("Number of cells") + ylab("Fraction of peaks recovered")
p <- p + theme_bw() + ylim(c(0, 1))
p <- p + scale_x_continuous(breaks = seq(0, 2850, 200))

# Coverage panels for entries 29, 10 and 1 of cp.list (2850, 950 and 50
# cells), stacked in a single column
cp.use <- cp.list[c(29, 10, 1)]
p2 <- wrap_plots(cp.use, ncol = 1)

# Curve on the left, coverage panels on the right
fig <- (p | p2)
fig <- fig + plot_layout(heights = c(1, 3))
ggsave(
  filename = "figures/peakcalls.png",
  plot = fig,
  height = 10,
  width = 16
)

R ggplot2 Seurat GenomicRanges patchwork Signac From line 1 of code/peak_calling.R

library(Signac)
library(Seurat)
library(GenomicRanges)

# Merged PBMC scATAC-seq fragment file
frags <- "data/pbmc_atac/fragments.bed.gz"

# Keep barcodes with more than 1000 counted fragments
fragment.counts <- CountFragments(frags)
cells.use <- fragment.counts[fragment.counts$frequency_count > 1000, "CB"]

fragments <- CreateFragmentObject(
  path = frags,
  cells = cells.use,
  validate.fragments = FALSE
)

# Use the original author's MACS2 install when it is present; on any other
# machine fall back to NULL so that CallPeaks locates MACS2 itself instead of
# failing on a path that does not exist.
macs2.path <- "/home/stuartt/miniconda3/envs/signac/bin/macs2"
if (!file.exists(macs2.path)) {
  macs2.path <- NULL
}

peaks <- CallPeaks(fragments, macs2.path = macs2.path)
# drop peaks that overlap the hg19 blacklist regions
peaks <- subsetByOverlaps(peaks, blacklist_hg19, invert = TRUE)

# Count fragments per peak for the retained barcodes
counts <- FeatureMatrix(fragments = fragments, features = peaks, cells = cells.use)

# Wrap the count matrix and fragment object in a chromatin assay and build
# the Seurat object around it
atac.assay <- CreateChromatinAssay(counts = counts, fragments = fragments)
pbmc <- CreateSeuratObject(counts = atac.assay, assay = "ATAC")

# Keep cells with more than 1000 counts in the ATAC assay
pbmc <- pbmc[, pbmc$nCount_ATAC > 1000]

# Export the peak ranges (tab-separated, with a header row) and the final
# cell barcodes, one per line
peaks <- as.data.frame(granges(pbmc))
write.table(
  x = peaks,
  file = "data/pbmc_atac/peaks.bed",
  sep = "\t",
  row.names = FALSE,
  col.names = TRUE,
  quote = FALSE
)
writeLines(text = colnames(x = pbmc), con = "data/pbmc_atac/cells.txt")

# cluster and make UMAP
# LSI components 2 to 30 are used for both the UMAP and the neighbor graph
# (the first component is left out)
lsi.dims <- 2:30

pbmc <- FindTopFeatures(object = pbmc, min.cutoff = 10)
pbmc <- RunTFIDF(object = pbmc)
pbmc <- RunSVD(object = pbmc)
pbmc <- RunUMAP(object = pbmc, reduction = "lsi", dims = lsi.dims)
pbmc <- FindNeighbors(object = pbmc, reduction = "lsi", dims = lsi.dims)
pbmc <- FindClusters(object = pbmc, algorithm = 3, resolution = 0.5)

# Persist the processed object
saveRDS(object = pbmc, file = "objects/pbmc_atac.rds")

R macs2 Seurat GenomicRanges Signac From line 1 of code/process_pbmc_atac.R

# Installer settings: CRAN mirror and number of parallel install processes
options(repos = c("CRAN" = "https://cran.rstudio.com/"))
options(Ncpus = 4)

# CRAN packages needed before anything else ("dplyr" was listed twice in the
# original vector; each package now appears once)
cran.pkgs <- c(
  "remotes", "BiocManager", "tidyr", "dplyr", "RANN", "cluster", "ROCR",
  "patchwork", "mclust", "paletteer", "ggthemes", "arrow"
)
install.packages(pkgs = cran.pkgs)

# Bioconductor core, then a few Bioconductor packages installed up front
BiocManager::install()
BiocManager::install(pkgs = c("GenomeInfoDbData", "HSMMSingleCell", "GO.db", "DelayedArray"))
# select the first two entries of the repository list so install.packages()
# below can use both
setRepositories(ind = 1:2)

install.packages("Seurat", dependencies = TRUE)
install.packages("Signac", dependencies = TRUE)
remotes::install_github(repo = "jlmelville/uwot")
remotes::install_github(repo = "mojaveazure/seurat-disk")

# Bioconductor packages used by the analysis: mouse and human annotation and
# genome packages, motif resources, and single-cell utilities
bioc.pkgs <- c(
  "EnsDb.Mmusculus.v79",
  "BSgenome.Mmusculus.UCSC.mm10",
  "TFBSTools",
  "JASPAR2020",
  "EnsDb.Hsapiens.v86",
  "BSgenome.Hsapiens.UCSC.hg38",
  "EnsDb.Hsapiens.v75",
  "BSgenome.Hsapiens.UCSC.hg19",
  "DropletUtils",
  "chromVAR",
  "HDF5Array",
  "DelayedMatrixStats",
  "batchelor",
  "scater"
)
BiocManager::install(pkgs = bioc.pkgs)

# GitHub-hosted packages for the methods benchmarked against Signac.
# remotes::install_github is used throughout: this script installs remotes
# but never installs devtools, so devtools:: calls fail on a clean library.

# snapatac
install.packages(c("doSNOW", "plot3D"))
remotes::install_github("r3fang/SnapATAC")

# archr
# pinned to a release; Bioconductor repositories are passed so ArchR's
# Bioconductor dependencies can be resolved
remotes::install_github(
  "GreenleafLab/ArchR",
  ref = "release_1.0.1",
  repos = BiocManager::repositories()
)

# cistopic
remotes::install_github("aertslab/RcisTarget")
remotes::install_github("aertslab/AUCell")
remotes::install_github("aertslab/cisTopic")

R Seurat Signac From line 1 of master/install_r_packages.R

shell:
    """
    Rscript install_r_packages.R
    touch install.done
    """

SnakeMake From line 19 of master/Snakefile

shell:
    """
    wget -i {input} -P data/{wildcards.dset}
    touch data/{wildcards.dset}/done.txt
    """

SnakeMake From line 34 of master/Snakefile

shell:
    """
    wget -i {input} -P data/pbmc_atac
    touch data/pbmc_atac/done.txt
    """

SnakeMake From line 47 of master/Snakefile

shell:
    """
    wget -i {input} -P data/gtex
    cd data/gtex
    tar -xvf GTEx_v8_finemapping_CAVIAR.tar
    rm GTEx_v8_finemapping_CAVIAR.tar
    """

SnakeMake From line 60 of master/Snakefile

shell:
    """
    wget -i {input} -P data/chen
    cd data/chen
    unzip master.zip
    rm master.zip
    """

SnakeMake From line 73 of master/Snakefile

# Prefix each dataset's barcodes, merge the four fragment files, then sort,
# bgzip and tabix-index the result. Only the intermediates created here are
# concatenated and removed: globbing *.bed would also pick up (and delete)
# unrelated BED files in data/pbmc_atac such as peaks.bed.
shell:
    """
    cd data/pbmc_atac
    gzip -d *.tsv.gz
    awk 'BEGIN {{FS=OFS="\\t"}} {{print $1,$2,$3,"10kng_"$4,$5}}' atac_pbmc_10k_nextgem_fragments.tsv > 1.bed
    awk 'BEGIN {{FS=OFS="\\t"}} {{print $1,$2,$3,"10k_"$4,$5}}' atac_pbmc_10k_v1_fragments.tsv > 2.bed
    awk 'BEGIN {{FS=OFS="\\t"}} {{print $1,$2,$3,"5kng_"$4,$5}}' atac_pbmc_5k_nextgem_fragments.tsv > 3.bed
    awk 'BEGIN {{FS=OFS="\\t"}} {{print $1,$2,$3,"5k_"$4,$5}}' atac_pbmc_5k_v1_fragments.tsv > 4.bed
    cat 1.bed 2.bed 3.bed 4.bed > frags.bed
    sort -k1,1 -k2,2n frags.bed > fragments.bed
    bgzip -@ {threads} fragments.bed
    tabix -p bed fragments.bed.gz
    rm 1.bed 2.bed 3.bed 4.bed frags.bed *.tsv
    """

SnakeMake tabix From line 90 of master/Snakefile

shell: "Rscript code/process_pbmc_atac.R"

SnakeMake From line 112 of master/Snakefile

shell: "Rscript code/process_{wildcards.dset}.R"

SnakeMake From line 121 of master/Snakefile

shell: "Rscript code/downsampling_code/downsample.R"

SnakeMake From line 134 of master/Snakefile

# Run through Rscript like every other R-script rule, so the rule does not
# depend on the file being executable with a shebang line.
shell: "Rscript code/downsampling_code/downsample_archr.R"

SnakeMake From line 145 of master/Snakefile

shell: "Rscript code/downsampling_code/get_annotations.R"

SnakeMake From line 154 of master/Snakefile

shell: "bash code/biccn_downsampling/benchmark.sh"

SnakeMake From line 164 of master/Snakefile

shell: "bash code/pbmc_atac_downsampling/benchmark.sh"

SnakeMake From line 174 of master/Snakefile

shell: "bash code/pbmc_atac_downsampling/benchmark_archr.sh"

SnakeMake From line 181 of master/Snakefile

shell: "bash code/biccn_downsampling/benchmark_archr.sh"

SnakeMake From line 188 of master/Snakefile

shell:
    """
    Rscript code/biccn_downsampling/collate_timings.R
    Rscript code/pbmc_atac_downsampling/collate_timings.R
    """

SnakeMake From line 198 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/run_pbmc_downsample.R"

SnakeMake From line 211 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/run_lsi.R"

SnakeMake From line 222 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/run_cistopic.R"

SnakeMake From line 235 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/run_snapatac.R"

SnakeMake From line 246 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/run_scale.R"

SnakeMake From line 257 of master/Snakefile

shell: "Rscript code/pbmc_downsampling/evaluate_dimreducs.R"

SnakeMake From line 272 of master/Snakefile

shell:
    """
    Rscript code/create_biccn_signac.R
    Rscript code/create_biccn_archr.R
    Rscript code/create_pbmc_atac_signac.R
    Rscript code/create_pbmc_atac_archr.R
    """

SnakeMake From line 285 of master/Snakefile

shell:
    """
    Rscript code/link_peaks.R
    """

SnakeMake From line 301 of master/Snakefile

shell:
    """
    Rscript code/analyze_pbmc.R
    touch eqtl.done
    """

SnakeMake From line 315 of master/Snakefile

shell:
    """
    Rscript code/multimodal_label_transfer.R
    """

SnakeMake From line 328 of master/Snakefile

shell:
    """
    Rscript code/analyze_pbmc.R
    """

SnakeMake From line 342 of master/Snakefile

shell:
    """
    Rscript code/figure2.R
    """

SnakeMake From line 354 of master/Snakefile

shell:
    """
    Rscript code/figure4.R
    """

SnakeMake From line 364 of master/Snakefile

shell:
    """
    Rscript code/figure5.R
    """

SnakeMake From line 376 of master/Snakefile

shell:
    """
    Rscript code/peak_calling.R
    """

SnakeMake From line 386 of master/Snakefile

shell:
    """
    Rscript code/clustering.R
    """

SnakeMake From line 396 of master/Snakefile

Code to reproduce analyses shown in the Signac paper

Help improve this workflow!

Signac paper

Code Snippets

Comments

Support

Free

Related Workflows

public

public

public

public

public

public