BioWorkflows: Nanobody-tethered transposition followed by sequencing

library(Signac)
library(Seurat)
library(SummarizedExperiment)
library(future)
library(GenomicRanges)
library(EnsDb.Hsapiens.v86)

plan("multicore", workers = 8)
options(future.globals.maxSize = 100 * 1024 ^ 3)

### Load processed object for the metadata
orig.obj <- readRDS("data/bmmc_atac/scATAC-Healthy-Hematopoiesis-191120.rds")
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

# get all fragment files
frag.files <- list.files("data/bmmc_atac/", pattern = "^scATAC_", full.names = TRUE)
frag.files <- paste0(frag.files, "/fragments.tsv.gz")
names(frag.files) <-  list.files("data/bmmc_atac/", pattern = "^scATAC_")

# call peaks on each and combine
pk.all <- list()
for (i in seq_along(frag.files)) {
  pks <- CallPeaks(
    object = frag.files[[i]]
  )
  pk.all[[i]] <- pks
}

gr.combined <- Reduce(f = c, x = pk.all)
gr.combined <- GenomicRanges::reduce(gr.combined)
gr.combined <- keepStandardChromosomes(gr.combined, pruning.mode = "coarse")

# quantify and create objects
md <- colData(orig.obj)

obj.list <- list()
for (i in seq_along(frag.files)) {
  obj_ident <- gsub(pattern = "scATAC_", x = names(frag.files)[[i]], replacement = "")
  cells.keep <- md[md$Group == obj_ident, "Barcode"]
  frags <- CreateFragmentObject(
    path = frag.files[[i]],
    cells = cells.keep
  )
  peakcounts <- FeatureMatrix(
    fragments = frags,
    features = gr.combined,
    cells = cells.keep
  )
  obj <- CreateChromatinAssay(counts = peakcounts, fragments = frags)
  obj <- CreateSeuratObject(counts = obj, assay = "ATAC")
  obj <- RenameCells(obj, add.cell.id = obj_ident)
  obj.list[[i]] <- obj
}

rownames(md) <- paste0(md$Group, "_", md$Barcode)
md <- as.data.frame(md)
bmmc <- merge(x = obj.list[[1]], y = obj.list[2:length(obj.list)])
bmmc <- AddMetaData(object = bmmc, metadata = md)
Annotation(bmmc) <- annot

bmmc$tissue <- sapply(strsplit(bmmc$Group, "_"), `[[`, 1)
bmmc$sample <- sapply(strsplit(bmmc$Group, "_"), `[[`, 2)

# remove PBMC
Idents(bmmc) <- "tissue"
bmmc <- subset(bmmc, idents = "PBMC", invert = TRUE)

bmmc <- NucleosomeSignal(bmmc)

# process
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc)
bmmc <- RunSVD(bmmc)
bmmc <- RunUMAP(bmmc, reduction = "lsi", dims = 2:40)
ga <- GeneActivity(bmmc)
bmmc[["GA"]] <- CreateAssayObject(counts = ga)

DefaultAssay(bmmc) <- "GA"
bmmc <- NormalizeData(bmmc)

DefaultAssay(bmmc) <- "ATAC"

# integrate embeddings
atac <- SplitObject(bmmc, split.by = "Group")
atac <- lapply(atac, RunSVD, scale.embeddings = FALSE)

atac_anchors <- FindIntegrationAnchors(
  object.list = atac,
  reduction = "rlsi",
  dims = 2:30
)

atac_integrated <- IntegrateEmbeddings(
  anchorset = atac_anchors,
  new.reduction.name = "integrated_lsi",
  reductions = bmmc[["lsi"]],
  dims.to.integrate = 1:30
)
bmmc[["integrated_lsi"]] <- atac_integrated[["integrated_lsi"]]

saveRDS(object = bmmc, file = "objects/bmmc_atac.rds")

R Seurat GenomicRanges EnsDb.Hsapiens.v86 SummarizedExperiment Signac future From line 1 of bmmc_atac/process_bmmc_atac.r

library(Seurat)
library(Signac)
library(ggplot2)
library(patchwork)

bmmc <- readRDS("objects/bmmc_dual.rds")

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

p1 <- DimPlot(bmmc, reduction = "umap.me3", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) +
  theme_void() +
  theme(text = element_text(size = 10), legend.position = "none") +
  ggtitle("H3K27me3")
p2 <- DimPlot(bmmc, reduction = "umap.ac", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) +
  theme_void() +
  theme(text = element_text(size = 10), legend.position = "none") +
  ggtitle("H3K27ac") +
  NoLegend()
p3 <- DimPlot(bmmc, reduction = "umap.wnn", group.by = "celltype", label = TRUE, label.size = 4, repel = TRUE) + 
  ggtitle("BMMCs: H3K27me3 + H3K27ac") +
  theme_classic() +
  NoLegend() +
  ylab('UMAP 2') + xlab("UMAP 1")

pp <- ((p1 / p2) | p3) + plot_layout(widths = c(1, 2))
ggsave(filename = "plots/bmmc/dimplots_bmmc.png", plot = pp, height = 6, width = 8)

# fragment counts
fm <- CountFragments(
  fragments = Fragments(bmmc[["me3"]])[[1]]@path,
  cells = colnames(bmmc)
)

fa <- CountFragments(
  fragments = Fragments(bmmc[["ac"]])[[1]]@path,
  cells = colnames(bmmc)
)

mean(fm$frequency_count) # 1217
mean(fa$frequency_count) # 326
sd(fm$frequency_count)  # 1274
sd(fa$frequency_count)  # 334

# create plot
fm$mark <- "H3K27me3"
fa$mark <- "H3K27ac"
all_counts <- rbind(fm, fa)

p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments") +
  theme(legend.position = "none") +
  scale_fill_manual(values = c(colormap$H3K27ac, colormap$H3K27me3))

ggsave(filename = "plots/bmmc/fragments_bmmc.png", plot = p0, height = 3, width = 3)

R ggplot2 Seurat patcHwork Signac From line 1 of bmmc_dual/analysis.R

library(Signac)
library(Seurat)
library(GenomicRanges)
library(ggplot2)


bmmc <- readRDS("objects/bmmc_dual.rds")

# ENCODE peaks
pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz")
pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz")

pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3")
pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3")

t_bmmc_ac <- CountFragments(fragments = Fragments(bmmc[['ac']])[[1]]@path, cells = colnames(bmmc))
totals_bmmc_ac <- t_bmmc_ac$frequency_count
names(totals_bmmc_ac) <- t_bmmc_ac$CB

t_bmmc_me3 <- CountFragments(fragments = Fragments(bmmc[['me3']])[[1]]@path, cells = colnames(bmmc))
totals_bmmc_me3 <- t_bmmc_me3$frequency_count
names(totals_bmmc_me3) <- t_bmmc_me3$CB

# total counts in peak regions
get_total <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = Fragments(obj),
    features = regions,
    cells = colnames(obj)
  )
  return(colSums(counts))
}

# ntt
in_peak_bmmc_ac_ac <- get_total(obj = bmmc[['ac']], regions = pk_ac)
in_peak_bmmc_ac_me <- get_total(obj = bmmc[['ac']], regions = pk_me3)
in_peak_bmmc_me_me <- get_total(obj = bmmc[['me3']], regions = pk_me3)
in_peak_bmmc_me_ac <- get_total(obj = bmmc[['me3']], regions = pk_ac)

# frip
make_df <- function(totals, inpeak, assay, pk, dataset) {
  d <- data.frame(
    cell = names(totals),
    total = totals,
    in_peak = inpeak[names(totals)],
    assay = assay,
    peak = pk,
    dataset = dataset
  )
  d$FRIP <- d$in_peak / d$total
  return(d)
}

df <- rbind(
  make_df(totals_bmmc_ac, in_peak_bmmc_ac_ac, "H3K27ac", "H3K27ac", "NTT"),
  make_df(totals_bmmc_ac, in_peak_bmmc_ac_me, "H3K27ac", "H3K27me3", "NTT"),
  make_df(totals_bmmc_me3, in_peak_bmmc_me_me, "H3K27me3", 'H3K27me3', "NTT"),
  make_df(totals_bmmc_me3, in_peak_bmmc_me_ac, "H3K27me3", "H3K27ac", "NTT")
)

df_filt <- df[df$assay == df$peak, ]

p <- ggplot(df_filt, aes(x = peak, y = FRIP, fill = peak)) +
  geom_boxplot(outlier.size = 0.2) +
  scale_fill_manual(values = c("#F98401", "#D3145A")) +
  theme_bw() +
  ylab("FRiP") +
  xlab("Assay") +
  ggtitle("FRiP") +
  theme(legend.position = "none")

ggsave("plots/bmmc/frip_bmmc.png", plot = p, height = 4, width = 3)
saveRDS(object = df_filt, file = "data/bmmc_dual/frip.rds")

mean(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.18
sd(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.09
mean(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.26
sd(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.09

R ggplot2 Seurat GenomicRanges Signac From line 1 of bmmc_dual/frip_bmmc.R

library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(future)
library(GenomicRanges)


args <- commandArgs(trailingOnly = TRUE)
threads <- args[[1]]
outfile <- args[[2]]

plan("multicore", workers = as.integer(threads))
options(future.globals.maxSize = Inf)

# annotation
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"
genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23]

fpath <- "data/bmmc_dual/DNA"
fragfiles <- list.files(
  path = fpath,
  pattern = "*.bed.gz$",
  full.names = TRUE
)
fname <- gsub(pattern = paste0(fpath, "/"),
              replacement = "",
              x = fragfiles)
fname <- gsub(pattern = ".bed.gz",
              replacement = "",
              x = fname)
names(fragfiles) <- fname

total_counts <- CountFragments(fragments = as.list(fragfiles))
cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"]

frags_me3 <- CreateFragmentObject(
  path = fragfiles[['H3K27me3']],
  cells = cells.keep
)
frags_ac <- CreateFragmentObject(
  path = fragfiles[['H3K27ac']],
  cells = cells.keep
)

counts_me3 <- AggregateTiles(
  object = frags_me3,
  cells = cells.keep,
  min_counts = 1,
  binsize = 5000,
  genome = genome.use
)

counts_ac <- AggregateTiles(
  object = frags_ac,
  cells = cells.keep,
  min_counts = 3,
  binsize = 1000,
  genome = genome.use
)

# remove features overlapping blacklist
remove_blacklist <- function(counts) {
  olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified)
  is_bl <- queryHits(olap)
  counts <- counts[setdiff(1:nrow(counts), is_bl), ]
  return(counts)
}

# create seurat object
me3 <- CreateChromatinAssay(
  counts = remove_blacklist(counts_me3),
  fragments = fragfiles[['H3K27me3']],
  annotation = annot
)

ac <- CreateChromatinAssay(
  counts = remove_blacklist(counts_ac),
  fragments = fragfiles[['H3K27ac']],
  annotation = annot
)

bmmc <- CreateSeuratObject(counts = me3, assay = "me3")
bmmc[['ac']] <- ac

bmmc <- subset(bmmc, subset = nCount_me3 < 10000 &
                 nCount_ac < 10000 &
                 nCount_me3 > 100 &
                 nCount_ac > 75)

bmmc <- TSSEnrichment(bmmc, assay = "me3")
bmmc$tss.me3 <- bmmc$TSS.enrichment
bmmc <- TSSEnrichment(bmmc, assay = "ac")
bmmc$tss.ac <- bmmc$TSS.enrichment

bmmc <- NucleosomeSignal(bmmc, assay = "me3")
bmmc$nucleosome.me3 <- bmmc$nucleosome_signal
bmmc <- NucleosomeSignal(bmmc, assay = "ac")
bmmc$nucleosome.ac <- bmmc$nucleosome_signal

bmmc <- subset(bmmc, subset = tss.ac > 1)

DefaultAssay(bmmc) <- "me3"
feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1)) # remove low-count features
bmmc[['me3']] <- subset(bmmc[['me3']], features = feat.keep)
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_me3))
bmmc <- RunSVD(bmmc, reduction.name = "lsi.me3", n = 100, scale.embeddings = FALSE)
bmmc <- RunUMAP(bmmc, reduction = "lsi.me3", reduction.name = "umap.me3", dims = 2:100)

DefaultAssay(bmmc) <- "ac"
feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1))
bmmc[['ac']] <- subset(bmmc[['ac']], features = feat.keep)
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_ac))
bmmc <- RunSVD(bmmc, reduction.name = "lsi.ac", n = 100, scale.embeddings = FALSE)
bmmc <- RunUMAP(bmmc, reduction = "lsi.ac", reduction.name = "umap.ac", dims = 2:60)

bmmc <- FindMultiModalNeighbors(
  object = bmmc,
  reduction.list = list("lsi.me3", "lsi.ac"),
  dims.list = list(2:100, 2:60)
)
bmmc <- RunUMAP(bmmc, nn.name = "weighted.nn", reduction.name = "umap.wnn")
bmmc <- FindClusters(bmmc, graph.name = "wsnn", algorithm = 3, resolution = 4)

renaming <- list(
  '0' = "NK",
  '1' = "NK",
  '2' = "GMP/CLP",
  '3' = 'CD8 Naive',
  '4' = 'CD4 Naive',
  '5' = 'CD14 Mono',
  '6' = 'pDC',
  '7' = 'B',
  '8' = 'CD4 Memory',
  '9' = 'CD8 Memory',
  '10' = 'CD14 Mono',
  '11' = 'Pre-B',
  '12' = 'CD8 Memory',
  '13' = 'Late erythroid',
  '14' = 'Early erythroid',
  '15' = 'CD14 Mono',
  '16' = 'B',
  '17' = 'HSPC',
  '18' = 'CD14 Mono',
  '19' = 'CD14 Mono',
  '20' = 'Plasma',
  '21' = 'Early basophil'
)
Idents(bmmc) <- "seurat_clusters"
bmmc <- RenameIdents(bmmc, renaming)
bmmc$celltype <- Idents(bmmc)

saveRDS(object = bmmc, file = "objects/bmmc_dual.rds")

R Seurat BSgenome.Hsapiens.UCSC.hg38 GenomicRanges EnsDb.Hsapiens.v86 Signac future From line 1 of bmmc_dual/process.R

library(Signac)
library(Seurat)

obj <- readRDS("objects/bmmc_dual.rds")
outdir <- "data/bmmc_dual/bulk_fragments/"

# remove non-cell fragments for each assay for genome bin quant
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "ac.bed.gz"),
  verbose = TRUE
)

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "me3.bed.gz"),
  verbose = TRUE
)

R Quant Seurat Signac From line 1 of bmmc_dual/split.R

library(Signac)
library(Seurat)
library(ggplot2)
library(Matrix)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomeInfoDb)
library(RColorBrewer)
library(RcppRoll)

if (!requireNamespace("SeuratWrappers", quietly = TRUE)) {
  remotes::install_github('satijalab/seurat-wrappers')
}

library(SeuratWrappers)

if (!requireNamespace("monocle3", quietly = TRUE)) {
  setRepositories(ind=1:2)
  remotes::install_github('cole-trapnell-lab/leidenbase')
  remotes::install_github('cole-trapnell-lab/monocle3')
}

library(monocle3)
library(future)
plan("multicore", workers = 12)
options(future.globals.maxSize = Inf)


bmmc <- readRDS("objects/bmmc_dual.rds")
Idents(bmmc) <- "celltype"
cells.use <- c("HSPC", "GMP/CLP", "Pre-B", "B", "Plasma")

bcell <- bmmc[, bmmc$celltype %in% cells.use]

DefaultAssay(bcell) <- "me3"
feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features
bcell[['me3']] <- subset(bcell[['me3']], features = feat.keep)
bcell <- FindTopFeatures(bcell, min.cutoff = 2)
bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_me3))
bcell <- RunSVD(bcell, reduction.name = "lsi.me3", scale.embeddings = FALSE)
bcell <- RunUMAP(bcell, reduction = "lsi.me3", reduction.name = 'umap.me3', dims = 2:20)

DefaultAssay(bcell) <- "ac"
feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features
bcell[['ac']] <- subset(bcell[['ac']], features = feat.keep)
bcell <- FindTopFeatures(bcell, min.cutoff = 2)
bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_ac))
bcell <- RunSVD(bcell, reduction.name = "lsi.ac", scale.embeddings = FALSE)
bcell <- RunUMAP(bcell, reduction = "lsi.ac", reduction.name = 'umap.ac', dims = 2:20)

# WNN
bcell <- FindMultiModalNeighbors(
  object = bcell,
  reduction.list = list('lsi.me3', 'lsi.ac'),
  dims.list = list(2:20, 2:10)
)

bcell <- RunUMAP(bcell, nn.name = "weighted.nn", reduction.name = "umap.wnn")

## run monocle3
bcell@reductions$umap <- bcell@reductions$umap.wnn # needs to be named UMAP for monocle
bcell.cds <- as.cell_data_set(bcell)
bcell.cds <- cluster_cells(cds = bcell.cds, reduction_method = "UMAP")
bcell.cds <- learn_graph(bcell.cds, use_partition = FALSE)
root_cells <- WhichCells(bcell, idents = "HSPC")
bcell.cds <- order_cells(bcell.cds, reduction_method = "UMAP", root_cells = tail(root_cells, 10))

bcell <- AddMetaData(
  object = bcell,
  metadata = bcell.cds@principal_graph_aux@listData$UMAP$pseudotime,
  col.name = "B.cell.pseudotime"
)

fp1 <- FeaturePlot(bcell, "B.cell.pseudotime", reduction = "umap.wnn") +
  scale_color_viridis_c(option = "C") +
  theme_void() +
  ggtitle("") +
  theme(legend.position = "none")

ggsave("plots/bmmc/trajectory.png", plot = fp1, height = 4, width = 4)

## find features correlated with pseudotime
chr.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22]

bins_me3 <- GenomeBinMatrix(
  fragments = Fragments(bcell[['me3']]),
  genome = chr.use,
  binsize = 10000,
  cells = colnames(bcell)
)
saveRDS(bins_me3, "bins_me3_bcell.rds")
bins_me3 <- readRDS("bins_me3_bcell.rds")

bins_ac <- GenomeBinMatrix(
  fragments = Fragments(bcell[['ac']]),
  genome = chr.use,
  binsize = 10000,
  cells = colnames(bcell)
)
saveRDS(bins_ac, "bins_ac_bcell.rds")
bins_ac <- readRDS("bins_ac_bcell.rds")

DefaultAssay(bcell) <- "me3"
bcell <- FindNeighbors(bcell, reduction = "lsi.me3", dims = 2:20, k = 50)
knn <- bcell[['me3_nn']]

knn_smooth <- function(counts, knn) {
  knn_norm <- knn / rowSums(knn)
  smoothed <- Matrix::tcrossprod(counts, knn_norm)
  return(smoothed)
}

smoothed_me3 <- knn_smooth(counts = bins_me3, knn = knn)
smoothed_ac <- knn_smooth(counts = bins_ac, knn = knn)

# filter regions based on total counts
keep_region <- (rowSums(smoothed_ac) > 5) & (rowSums(smoothed_me3) > 5)

smoothed_me3 <- smoothed_me3[keep_region, ]
smoothed_ac <- smoothed_ac[keep_region, ]

# depth normalize
smoothed_me3 <- t(t(smoothed_me3) / colSums(smoothed_me3))
smoothed_ac <- t(t(smoothed_ac) / colSums(smoothed_ac))

## correlation
pstime <- bcell$B.cell.pseudotime[colnames(bins_me3)]
peak_cor_me3 <- cor(t(as.matrix(smoothed_me3)), pstime)[, 1]
peak_cor_ac <- cor(t(as.matrix(smoothed_ac)), pstime)[, 1]

pv_me3 <- apply(smoothed_me3, 1, function(x) {
  cor.test(x, pstime)$p.value
})
pv_ac <- apply(smoothed_ac, 1, function(x) {
  cor.test(x, pstime)$p.value
})

cor_df <- data.frame(me3 = peak_cor_me3, ac = peak_cor_ac,
                     p_me3 = pv_me3, p_adj_me3 = p.adjust(pv_me3),
                     p_ac = pv_ac, p_adj_ac = p.adjust(pv_ac))
cor_df$diff <- cor_df$me3 - cor_df$ac

# cor 0.2 and difference 0.5
tc <- cor_df[abs(cor_df$diff) > 0.5 & abs(cor_df$me3) > 0.2 & abs(cor_df$ac) > 0.2, ]

# order cells by pseudotime
porder <- order(pstime)
ac_peaks <- smoothed_ac[rownames(tc), porder]
me3_peaks <- smoothed_me3[rownames(tc), porder]

# order based on which point in trajectory has max value using broader smoothing
rowsmoothed <- t(roll_sum(as.matrix(t(me3_peaks)), n = 200))
rownames(rowsmoothed) <- rownames(me3_peaks)
rowmax <- apply(rowsmoothed, 1, which.max)
me3_peaks <- me3_peaks[order(rowmax), ]
ac_peaks <- ac_peaks[order(rowmax), ]

rowsmoothed_ac <- t(roll_sum(as.matrix(t(ac_peaks)), n = 100))
mm_ac <- t(apply(as.matrix(rowsmoothed_ac), 2, rev))

rowsmoothed_me3 <- t(roll_sum(as.matrix(t(me3_peaks)), n = 100))
mm_me3 <- t(apply(as.matrix(rowsmoothed_me3), 2, rev))

ac_max <- quantile(mm_ac, probs = seq(0, 1, 0.05))
me3_max <- quantile(mm_me3, probs = seq(0, 1, 0.05))

me3_plot <- mm_me3
me3_plot[me3_plot > me3_max[['95%']]] <- me3_max[['95%']]
me3_plot[me3_plot < me3_max[['5%']]] <- me3_max[['5%']]

ac_plot <- mm_ac
ac_plot[ac_plot > ac_max[['95%']]] <- ac_max[['95%']]
ac_plot[ac_plot < ac_max[['5%']]] <- ac_max[['5%']]

cols.use <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(100))
png("./plots/hmap_ac.png", width = 5000, height = 5000, units = "px")
image(ac_plot, col = cols.use, main = "",
      xaxt = 'n', yaxt = 'n', ylab = "", xlab = "")
dev.off()

png("./plots/hmap_me3.png", width = 5000, height = 5000, units = "px")
image(me3_plot, col = cols.use, main = "",
      xaxt = 'n', yaxt = 'n', ylab = "", xlab = "")
dev.off()

# group peaks by whether they gain me3 or gain ac
tc <- tc[order(tc$me3, decreasing = TRUE), ]
peaks_me3 <- rownames(tc[tc$me3 > 0, ])
peaks_ac <- rownames(tc[tc$ac > 0, ])

cf_me3 <- ClosestFeature(bmmc, region = peaks_me3)
cf_ac <- ClosestFeature(bmmc, region = peaks_ac)

cf_me3 <- cf_me3[cf_me3$gene_biotype == "protein_coding", ]
cf_ac <- cf_ac[cf_ac$gene_biotype == "protein_coding", ]

maxdist <- 50000
closest_me3 <- unique(cf_me3[cf_me3$distance < maxdist, "gene_name"])
closest_ac <- unique(cf_ac[cf_ac$distance < maxdist, "gene_name"])

# save colorbar
png("plots/bmmc/colorbar_heatmap.png", width = 10, height = 4, units = "in", res = 400)
image(matrix(1:100), col=cols.use)
dev.off()

# RNA
bmmc_rna <- readRDS("objects/fullref.Rds")
ct.keep.rna <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma")
bmmc_rna_bcell <- bmmc_rna[, bmmc_rna$celltype.l2 %in% ct.keep.rna]

# module score for each set of genes
bmmc_rna_bcell <- AddModuleScore(
  object = bmmc_rna_bcell,
  features = list(setdiff(closest_me3, closest_ac)),
  name = "Repressed"
)
bmmc_rna_bcell <- AddModuleScore(
  object = bmmc_rna_bcell,
  features = list(setdiff(closest_ac, closest_me3)),
  name = "Activated"
)

Idents(bmmc_rna_bcell) <- "celltype.l2"
levels(bmmc_rna_bcell) <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma")

v1 <- VlnPlot(bmmc_rna_bcell, "Repressed1", pt.size = 0) + NoLegend() +
  geom_hline(yintercept = 0) +
  ylab("Module expression") +
  xlab("") +
  ylim(c(-0.15, 0.15)) +
  ggtitle("Repressed genes") +
  theme_classic() +
  theme(legend.position = "none", text = element_text(size = 20)) +
  scale_x_discrete(guide = guide_axis(angle = 45))

v2 <- VlnPlot(bmmc_rna_bcell, "Activated1", pt.size = 0) + NoLegend() +
  geom_hline(yintercept = 0) +
  ylab("Module expression") +
  xlab("") +
  ylim(c(-0.15, 0.15)) +
  ggtitle("Activated genes") +
  theme_classic() +
  theme(legend.position = "none", text = element_text(size = 20)) +
  scale_x_discrete(guide = guide_axis(angle = 45))

ggsave(filename = "plots/bmmc/rna_repressed.pdf", plot = v1, height = 4, width = 5)
ggsave(filename = "plots/bmmc/rna_activated.pdf", plot = v2, height = 4, width = 5)

# add DE test between module scores at start vs end of trajectory
module_activated <- bmmc_rna_bcell$Activated1
module_repressed <- bmmc_rna_bcell$Repressed1

progenitor <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("HSC", "LMPP")]
mature <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("Memory B", "Plasma")]

t.test(module_activated[progenitor], module_activated[mature]) # <2.2e-16
t.test(module_repressed[progenitor], module_repressed[mature]) # <2.2e-16

########################
# BMMC atac comparison
########################

bmmc_atac <- readRDS("objects/bmmc_atac.rds")

# subset b cell lineage
ct.keep <- c("01_HSC", "06_CLP.1", "15_CLP.2", "05_CMP.LMPP",
             "17_B", "18_Plasma")
b_atac <- bmmc_atac[, bmmc_atac$BioClassification %in% ct.keep]

DefaultAssay(b_atac) <- "ATAC"
b_atac <- FindTopFeatures(b_atac, min.cutoff = 5)
b_atac <- RunTFIDF(b_atac)
b_atac <- RunSVD(b_atac)
b_atac <- RunUMAP(b_atac, reduction = "lsi", dims = 2:20)
b_atac <- FindNeighbors(b_atac, reduction = "lsi", dims = 2:20)
knn_atac <- b_atac[['ATAC_nn']]

# add numeric score for b trajectory
b_atac$trajectory <- as.numeric(factor(b_atac$BioClassification,
                                       levels = c("01_HSC",
                                                  "05_CMP.LMPP",
                                                  "06_CLP.1",
                                                  "15_CLP.2",
                                                  "17_B",
                                                  "18_Plasma")))

b_atac_traj <- b_atac$trajectory

b_atac$b <- factor(b_atac$BioClassification,
                                       levels = c("01_HSC",
                                                  "05_CMP.LMPP",
                                                  "06_CLP.1",
                                                  "15_CLP.2",
                                                  "17_B",
                                                  "18_Plasma"))


Idents(b_atac) <- "b"
levels(bcell) <- c('HSPC', 'GMP/CLP', 'Pre-B', 'B', 'Plasma')

# check dynamic regions in atac dataset
tc_regions_atac <- FeatureMatrix(
  fragments = Fragments(b_atac),
  features = rownames(tc),
  cells = colnames(b_atac)
)
smoothed_atac <- knn_smooth(counts = tc_regions_atac[, colnames(knn_atac)], knn = knn_atac)
smoothed_atac <- t(t(smoothed_atac) / colSums(smoothed_atac))
peak_cor_atac <- cor(t(as.matrix(smoothed_atac)), b_atac_traj)[, 1]

pv_atac <- apply(smoothed_atac, 1, function(x) {
  cor.test(x, b_atac_traj)$p.value
})

nc <- peak_cor_atac[abs(peak_cor_atac) < 0.05]
ns <- p.adjust(pv_atac[names(nc)]) > 0.01

sum(ns)

R ggplot2 Seurat RColorBrewer BSgenome.Hsapiens.UCSC.hg38 Matrix Signac future GenomeInfoDb SeuratWrappers RcppRoll From line 1 of bmmc_dual/trajectory.R

library(Signac)
library(Seurat)

atac <- readRDS("objects/bmmc_atac.rds")
bmmc_dual <- readRDS("objects/bmmc_dual.rds")

bmmc_atac <- atac[, atac$tissue == "BMMC"]

DefaultAssay(bmmc_atac) <- "ATAC"
bmmc_atac <- FindTopFeatures(bmmc_atac)
bmmc_atac <- RunTFIDF(bmmc_atac)
bmmc_atac <- RunSVD(bmmc_atac)
bmmc_atac <- RunUMAP(bmmc_atac, reduction = "lsi", dims = 2:50)

# quantify atac peaks in ntt
counts <- FeatureMatrix(
  fragments = Fragments(bmmc_dual[['ac']])[[1]],
  features = granges(bmmc_atac[['ATAC']]),
  cells = colnames(bmmc_dual)
)

bmmc_dual[['ATAC']] <- CreateChromatinAssay(
  counts = counts, fragments = Fragments(bmmc_dual[['ac']])[[1]]
)

DefaultAssay(bmmc_dual) <- "ATAC"

# normalize with reference IDF
get_idf <- function(x) {
  rsums <- rowSums(x = x)
  idf <- ncol(x = x) / rsums
  idf <- log(1 + idf)
  return(idf)
}

idf <- get_idf(x = GetAssayData(bmmc_atac, assay = "ATAC", slot = "counts"))

bmmc_dual <- RunTFIDF(bmmc_dual, idf = idf)

# find transfer anchors
anchors <- FindTransferAnchors(
  reference = bmmc_atac,
  query = bmmc_dual,
  reduction = "lsiproject",
  dims = 2:30,
  reference.reduction = "lsi",
  reference.assay = "ATAC",
  query.assay = "ATAC"
)

pred <- TransferData(
  anchorset = anchors,
  refdata = bmmc_atac$BioClassification,
  weight.reduction = bmmc_dual[['lsi.me3']],
  dims = 2:50
)

write.table(pred, "datasets/bmmc_dual_predictions.tsv", quote = FALSE, sep = "\t")

R Seurat Signac From line 1 of bmmc_dual/transfer.R

library(Signac)
library(Seurat)

ac <- readRDS("data/ct_pro/H3K27ac.rds")
Fragments(ac) <- NULL
Fragments(ac) <- CreateFragmentObject(
    path = "data/ct_pro/H3K27ac_fragments.tsv.gz",
    cells = Cells(ac)
)
saveRDS(object = ac, file = "data/ct_pro/H3K27ac_updated.rds")

me <- readRDS("data/ct_pro/H3K27me3.rds")
Fragments(me) <- NULL
Fragments(me) <- CreateFragmentObject(
    path = "data/ct_pro/H3K27me3_fragments.tsv.gz",
    cells = Cells(me)
)
saveRDS(object = me, file = "data/ct_pro/H3K27me3_updated.rds")

R Seurat Signac From line 1 of CT_pro/update_objects.R

import pysam
from Bio import SeqIO
import os
import gzip
from pathlib import Path
from argparse import ArgumentParser
import sys

# demux using i5 index only

# parse args
parser = ArgumentParser(description='Read demultiplexer')
parser.add_argument('--read1', help='Path to read1')
parser.add_argument('--read_i5', help='Path to read2')
parser.add_argument('--read2', help='Path to read3')
parser.add_argument('--tn5', help='Path to Tn5 i5 index FASTA file')
parser.add_argument('--output', help='Path to output directory')
args = parser.parse_args()


def hamming_distance(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))


def open_fastq(f):
    if os.path.isfile(f):
        if f.endswith(".gz"):
            f_open = gzip.open(f, "rt") # rt read text mode (decodes binary gzip)
        else:
            f_open = open(f, "r")
        return(f_open)
    else:
        raise Exception("File not found")


def read_sample_barcodes(inpath, maxlen=8):
    all_bc = dict()
    for i in SeqIO.parse(open(inpath),'fasta'):
        all_bc[str(i.seq[:maxlen])] = i.id
    return(all_bc)


def get_entry(f):
    return([f.readline(), f.readline(), f.readline(), f.readline()])


def extract_barcodes_simple(sequence, bc1_len=8, spacer_len=14):
    # version with fixed-length barcodes
    sequence = sequence.strip("\n")
    tn5_bc = sequence[:bc1_len]
    cell_bc = sequence[bc1_len+spacer_len:]
    if len(cell_bc) != 16:
        print(cell_bc)
    return((tn5_bc, cell_bc))

tn5_i5_barcodes = read_sample_barcodes(args.tn5, maxlen=8) # using first 8 bases of barcode

r1_read = open_fastq(f=args.read1)
i5_read = open_fastq(f=args.read_i5)
r2_read = open_fastq(f=args.read2)

# create dictionary with file handles
outf = dict()
outpath = Path(args.output)
if not outpath.exists():
    os.mkdir(outpath)

unique_i5_marks = [tn5_i5_barcodes[key] for key in tn5_i5_barcodes.keys()]
unique_i5_marks.append('unknown')
for i5 in unique_i5_marks:
    # outputting uncompressed fastq is ~10x faster
    fname_1 = open(outpath / (i5 + ".R1.fastq"), "w+")
    fname_2 = open(outpath / (i5 + ".R2.fastq"), "w+")
    outf[i5] = (fname_1, fname_2)

x = 0
while True:
    r1_entry = get_entry(r1_read)
    r2_entry = get_entry(r2_read)
    i5_entry = get_entry(i5_read)

    if r1_entry[0] == '':
        break

    # get barcodes
    tn5_barcode, cell_barcode = extract_barcodes_simple(sequence=i5_entry[1], bc1_len=8)

    if tn5_barcode in tn5_i5_barcodes.keys():
        mark = tn5_i5_barcodes[tn5_barcode]
    else:
        # compute hamming distance
        hams = [hamming_distance(tn5_barcode, x) < 3 for x in tn5_i5_barcodes.keys()]
        if sum(hams) == 1:
            mark = tn5_i5_barcodes[list(tn5_i5_barcodes.keys())[hams.index(True)]]
        else:
            mark = "unknown"

    # add barcodes to r1 and r2 genomic
    bc_combination = "@" + cell_barcode + ":" + tn5_barcode + "+"
    r1_entry[0] = bc_combination + r1_entry[0][1:]
    r2_entry[0] = bc_combination + r2_entry[0][1:]

    if mark == "unknown":
        r1_outf = outf['unknown'][0]
        r2_outf = outf['unknown'][1]
    else:
        # write to correct output files based on barcode combination
        r1_outf = outf[mark][0]
        r2_outf = outf[mark][1]

    r1_outf.write("".join(r1_entry))
    r2_outf.write("".join(r2_entry))
    x += 1
    if x % 1e6 == 0:
        print("Processed " + str(int(x/1e6)) + " million reads", file=sys.stderr, end="\r")

# close all files
for i in outf.keys():
    outf[i][0].close()
    outf[i][1].close()

Python Biopython pysam From line 1 of code/demux.py

library(GenomicRanges)

me3_peaks <- c("ENCFF138FVQ", "ENCFF720AKK",
               "ENCFF407VKP","ENCFF588JEN",
               "ENCFF570DGJ", "ENCFF071JFL",
               "ENCFF276IMO")

ac_peaks <- c("ENCFF240LSH", "ENCFF614FJO",
              "ENCFF449ITG", "ENCFF321NYQ",
              "ENCFF544QYY", "ENCFF059WXH",
              "ENCFF412NPA")

bulk_me3 <- "data/encode/ENCFF291LVP.bed.gz"
bulk_ac <- "data/encode/ENCFF832RWT.bed.gz"

me3_bedfiles <- paste0("data/encode/", me3_peaks, ".bed.gz")
ac_bedfiles <- paste0("data/encode/", ac_peaks, ".bed.gz")

read_bed <- function(filepath) {
    df <- read.table(file = filepath,
                     col.names = c("chr", "start", "end",
                                   "name", "score", "strand",
                                   "fold_change", "neg_log10pvalue_summit",
                                   "neg_log10qvalue_summit", "relative_summit_position"))
    gr <- makeGRangesFromDataFrame(df = df, keep.extra.columns = TRUE, 
        starts.in.df.are.0based = TRUE)
    return(gr)
}

me3_ranges <- sapply(me3_bedfiles, read_bed)
ac_ranges <- sapply(ac_bedfiles, read_bed)

all_me3 <- Reduce(f = c, x = me3_ranges)
me3 <- reduce(all_me3)
me3 <- keepStandardChromosomes(me3, pruning.mode = "coarse")
me3 <- me3[width(me3) < 20000]

all_ac <- Reduce(f = c, x = ac_ranges)
ac <- reduce(all_ac)
ac <- keepStandardChromosomes(ac, pruning.mode = "coarse")
ac <- ac[width(ac) < 20000]

combined <- c(ac, me3)

ac <- as.data.frame(ac)
write.table(x = ac, file = "data/encode/h3k27ac.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
me3 <- as.data.frame(me3)
write.table(x = me3, file = "data/encode/h3k27me3.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
combined <- as.data.frame(combined)
write.table(x = combined, file = "data/encode/all.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

# bulk peaks
ac_bulk <- read_bed(bulk_ac)
me3_bulk <- read_bed(bulk_me3)
all_bulk <- c(ac_bulk, me3_bulk)
all_bulk <- as.data.frame(all_bulk)
write.table(x = all_bulk, file = "data/encode/all_bulk.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

R GenomicRanges From line 1 of encode/combine_peaks.R

library(Signac)
library(ggplot2)

bigWigs <- list(
  "RNAPII (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-Pol2.bw",
  "H3K27me3 (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27me.bw",
  "H3K27ac (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27ac.bw",
  "RNAPII (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw",
  "H3K27me3 (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw",
  "H3K27ac (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw"
)

bw <- BigwigTrack(
  region = "chr11-69900000-70910000",
  bigwig = bigWigs,
  y_label = "Normalized coverage",
  ymax = "q90",
  bigwig.scale = "common",
  smooth = 1000
) + theme(legend.position = "none")

bw <- bw + scale_fill_manual(values = c("#036C9A", "#D3145A", "#F98401",
                                        "#676767", "#676767", "#676767"))

ggsave(
  filename = "plots/hek_k562_bulk/bulk_covplot_k562.png",
  plot = bw,
  height = 5,
  width = 10,
  units = "in"
)

R ggplot2 Signac From line 1 of HEK_K562_bulk/analysis.R

library(GenomicRanges)

regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE)

regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE)

regions_rna <- reduce(c(regions_s2, regions_s5))

regions_rna <- as.data.frame(regions_rna)
write.table(regions_rna, file = "data/k562_peaks/rna.bed", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")

R GenomicRanges From line 1 of HEK_K562_bulk/collect_peaks.R

library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(GenomeInfoDb)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
library(Rsamtools)

obj <- readRDS("objects/hek_k562.rds")

# dimplot for figure 1
p_ac <- DimPlot(obj, reduction = "umap.k27ac", pt.size = 0.1) + ggtitle("H3K27ac") + theme_void() + theme(legend.position = "none")
p_me <- DimPlot(obj, reduction = "umap.k27me", pt.size = 0.1) + ggtitle("H3K2me3") + theme_void() + theme(legend.position = "none")
p_pol <- DimPlot(obj, reduction = "umap.pol2", pt.size = 0.1) + ggtitle("RNAPII") + theme_void() + theme(legend.position = "none")
p_wnn <- DimPlot(obj, reduction = "wnn.umap") + ggtitle("H3K27me3 + H3K27ac + RNAPII") + theme_classic() + xlab("UMAP 1") + ylab("UMAP 2") + theme(legend.position = "none")

p1 <- ((p_ac / p_me / p_pol) | p_wnn) + plot_layout(widths = c(1, 2))

ggsave("plots/hek_k562/dimplot_hek_k562.pdf", plot = p1, height = 5, width = 8)

# bulk data for comparison
hek_mono_k27ac <- "data/HEK_K562_bulk/mapped/HEK-mono-K27ac.bw"
hek_mono_k27me3 <- "data/HEK_K562_bulk/mapped/HEK-mono-K27me.bw"
hek_mono_s2s5p <- "data/HEK_K562_bulk/mapped/HEK-mono-Pol2.bw"
k562_mono_k27ac <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw"
k562_mono_k27me3 <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw"
k562_mono_s2s5p <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw"

DefaultAssay(obj) <- "K27ac"

p <- CoveragePlot(
  object = obj,
  region = "SCMH1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 20000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
    filename = "plots/SCMH1_k562.png",
    plot = p,
    dpi = 600,
    units = "in",
    height = 5,
    width = 15
)

# additional regions for supplement
p <- CoveragePlot(
  object = obj,
  region = "NASP",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 20000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/NASP_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

p <- CoveragePlot(
  object = obj,
  region = "FAF1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 50000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/FAF1_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

p <- CoveragePlot(
  object = obj,
  region = "SDAD1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 100000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/SDAD1_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

# multiCUT&Tag comparison
# same
me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz"
ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz"

# mixed
ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz"
me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz"

totals_mct <- CountFragments(
  fragments = list(me3_me3, ac_ac, ac_me3, me3_ac)
)

cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"]

# count fragments for each mark
total_me3_mct <- CountFragments(fragments = me3_me3, cells = cells_keep_mct)
total_ac_mct <- CountFragments(fragments = ac_ac, cells = cells_keep_mct)

# count mixed 1/2 for each
total_me3_mixed_mct <- CountFragments(fragments = me3_ac, cells = cells_keep_mct)
total_ac_mixed_mct <- CountFragments(fragments = ac_me3, cells = cells_keep_mct)
total_me3_mixed_mct$frequency_count <- total_me3_mixed_mct$frequency_count / 2
total_ac_mixed_mct$frequency_count <- total_ac_mixed_mct$frequency_count / 2

mct_counts_me3 <- total_me3_mct$frequency_count
names(mct_counts_me3) <- total_me3_mct$CB
mct_counts_me3[total_me3_mixed_mct$CB] <- mct_counts_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count
mct_counts_me3[total_ac_mixed_mct$CB] <- mct_counts_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count

mct_counts_ac <- total_ac_mct$frequency_count
names(mct_counts_ac) <- total_ac_mct$CB
mct_counts_ac[total_ac_mixed_mct$CB] <- mct_counts_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count
mct_counts_ac[total_me3_mixed_mct$CB] <- mct_counts_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count

# same for reads
mct_reads_me3 <- total_me3_mct$reads_count
names(mct_reads_me3) <- total_me3_mct$CB
mct_reads_me3[total_me3_mixed_mct$CB] <- mct_reads_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count
mct_reads_me3[total_ac_mixed_mct$CB] <- mct_reads_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count

mct_reads_ac <- total_ac_mct$reads_count
names(mct_reads_ac) <- total_ac_mct$CB
mct_reads_ac[total_ac_mixed_mct$CB] <- mct_reads_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count
mct_reads_ac[total_me3_mixed_mct$CB] <- mct_reads_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count

count_df <- data.frame(count = c(mct_counts_me3, mct_counts_ac),
                       mark = c(rep("H3K27me3", length(mct_counts_me3)),
                                rep("H3K27ac", length(mct_counts_ac))),
                       dataset = "multiCUT&Tag",
                       reads = c(mct_reads_me3, mct_reads_ac)
                       )

# ntt
total_me3 <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/K27me.bed.gz",
    cells = colnames(obj)
)
total_ac <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/K27ac.bed.gz",
    cells = colnames(obj)
)
total_pol <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/Pol2.bed.gz",
    cells = colnames(obj)
)

# construct total vectors
me3 <- total_me3$frequency_count
names(me3) <- total_me3$CB
df1 <- data.frame(
    count = me3,
    mark = "H3K27me3",
    dataset = "scNTT-seq"
)
df1$reads <- total_me3$reads_count

ac <- total_ac$frequency_count
names(ac) <- total_ac$CB
df2 <- data.frame(
    count = ac,
    mark = "H3K27ac",
    dataset = "scNTT-seq"
)
df2$reads <- total_ac$reads_count

pol <- total_pol$frequency_count
names(pol) <- total_pol$CB
df3 <- data.frame(
    count = pol,
    mark = "RNAPII",
    dataset = "scNTT-seq"
)
df3$reads <- total_pol$reads_count

count_df <- rbind(count_df, df1, df2, df3)
count_df <- rbind(count_df, data.frame(count=c(1,1), mark="RNAPII", dataset="multiCUT&Tag", reads=c(1,1)))

# total fragments, split by mark
p3 <- ggplot(data = count_df, mapping = aes(mark, reads, fill=dataset)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_y_log10() +
  theme_bw() +
  ylab("Total reads per cell") +
  xlab("Antibody") +
  theme(legend.position = "none") +
  ggtitle("Reads per cell")

p4 <- ggplot(data = count_df, mapping = aes(mark, count, fill=dataset)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_y_log10() +
  theme_bw() +
  ylab("Total fragments per cell") +
  xlab("Antibody") +
  ggtitle("Fragments per cell")

ggsave(
    filename = "plots/hek_k562/sensitivity.png",
    plot = (p3 | p4),
    height = 4,
    width = 7,
    dpi = 500
)

# Region heatmaps
atac_peaks <- rtracklayer::import("data/K562_ATAC/ENCFF006OFA.bigBed")
atac_peaks <- reduce(atac_peaks)
chr.keep <- c(paste0("chr", 1:22), "chrX")
atac_peaks <- keepSeqlevels(atac_peaks, value = chr.keep, pruning.mode = "coarse")
atac_peaks <- subsetByOverlaps(atac_peaks, ranges = blacklist_hg38_unified, invert = TRUE)

regions_me3 <- read.table("data/k562_peaks/ENCFF031FSF.bed.gz", sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

regions_ac <- read.table("data/k562_peaks/ENCFF038DDS.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE)

regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE)

regions_rna <- reduce(c(regions_s2, regions_s5))
regions_rna$peak <- "RNAPII"

regions <- c(regions_me3, regions_ac, regions_rna)
regions <- keepStandardChromosomes(regions, pruning.mode = "coarse")
regions <- dropSeqlevels(regions, value = "chrM", pruning.mode = "coarse")
regions <- dropSeqlevels(regions, value = "chrY", pruning.mode = "coarse")

atac_chr1 <- atac_peaks[seqnames(atac_peaks) == 'chr1']
me3_peaks <- regions_me3[seqnames(regions_me3) == 'chr1']
ac_peaks <- regions_ac[seqnames(regions_ac) == 'chr1']
pol2_peaks <- regions_rna[seqnames(regions_rna) == 'chr1']

# ATAC
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "K27ac"
)
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "K27me"
)
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "Pol2"
)

# me3
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "Pol2"
)

# ac
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "Pol2"
)

# pol2
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "Pol2"
)

# plots
cm <- list("K27ac" = "#F98401", "K27me" = "#D3145A", "Pol2" = "#036C9A")
p4 <- RegionHeatmap(
  object = obj,
  assay = c("K27ac", "K27me", "Pol2"),
  cols = cm,
  key = "ATAC",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("ATAC peaks")

p5 <- RegionHeatmap(
  object = obj,
  assay = c("K27me", "K27ac", "Pol2"),
  cols = cm,
  key = "me3",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("H3K27me3 peaks")

p6 <- RegionHeatmap(
  object = obj,
  assay = c("K27ac", "K27me", "Pol2"),
  cols = cm,
  key = "ac",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("H3K27ac peaks")

p7 <- RegionHeatmap(
  object = obj,
  assay = c("Pol2", "K27ac", "K27me"),
  cols = cm,
  key = "pol2",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("RNAPII peaks")

update_plot <- function(p, y) {
  p[[1]] <- p[[1]] + ylab(y)
  p[[2]] <- p[[2]] + theme(axis.title.y=element_blank(),
                           axis.text.y=element_blank())
  p[[3]] <- p[[3]] + theme(axis.title.y=element_blank(),
                           axis.text.y=element_blank())
  return(p)
}

p4 <- update_plot(p4, y = "ATAC peaks")
p5 <- update_plot(p5, y = "H3K27me3 peaks")
p6 <- update_plot(p6, y = "H3K27ac peaks")
p7 <- update_plot(p7, y = "RNAPII peaks")

pp <- (p4 | p5 | p6 | p7) & theme(legend.position = "none")
ggsave(filename = "plots/hek_k562/region_heatmaps.png", plot = pp, height = 6, width = 12)

# genome-wide correlation plot

# bulk fragfiles
k562_mono_k27ac_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bed.gz"
k562_mono_k27me3_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bed.gz"
k562_mono_s2s5p_frag <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bed.gz"

# split single-cell
sc_k27_ac <- "data/HEK_K562_sc/split/K27ac_k562_filter.bed.gz"
sc_k27_me <- "data/HEK_K562_sc/split/K27me_k562_filter.bed.gz"
sc_pol <- "data/HEK_K562_sc/split/Pol2_k562_filter.bed.gz"

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}


# need bulk vs bulk
# bulk vs single cell
# single cell vs single cell

k27ac_cov <- QuantifyRegions(k562_mono_k27ac_frag, regions)
k27me3_cov <- QuantifyRegions(k562_mono_k27me3_frag, regions)
pol2_cov <- QuantifyRegions(k562_mono_s2s5p_frag, regions)

pol2_sc_cov <- QuantifyRegions(sc_pol, regions)
ac_sc_cov <- QuantifyRegions(sc_k27_ac, regions)
me_sc_cov <- QuantifyRegions(sc_k27_me, regions)

# create matrix
mat <- cbind(k27ac_cov, ac_sc_cov, k27me3_cov, me_sc_cov, pol2_cov, pol2_sc_cov)

correlation_matrix <- cor(mat)
colnames(correlation_matrix) <- c("Bulk H3K27ac", "sc H3K27ac",
                                  "Bulk H3K27me3", "sc H3K27me3",
                                  "Bulk RNAPII", "sc RNAPII")
rownames(correlation_matrix) <- colnames(correlation_matrix)
df <- as.data.frame(as.table(correlation_matrix))

p8 <- ggplot(df, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_distiller(palette = "RdYlBu") +
  xlab("") + ylab("") +
  guides(fill = guide_legend(title = "Pearson\ncorrelation")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggsave(filename = "plots/hek_k562/correlation.png", plot = p8, height = 5, width = 6)

# scatterplots

# 1. use encode peak regions for each mark
# 2. plot one mark vs another
# 3. fit linear model

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

mat <- as.data.frame(mat)
mat$region <- regions$peak
colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

# bulk-bulk
m.use <- mat[mat$region %in% c("H3K27me3", "H3K27ac"), ]
bulk_ac_me3 <- ggplot(m.use, aes(k27ac_cov, k27me3_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27me3 (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ k27me3_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII", "H3K27ac"), ]
bulk_ac_pol <- ggplot(m.use,
                      aes(k27ac_cov, pol2_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 300)) + ylim(c(0, 300)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("RNAPII (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII", "H3K27me3"), ]
bulk_pol_me <- ggplot(m.use,
                      aes(pol2_cov, k27me3_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("H3K27me3 (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ k27me3_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# bulk-sc
m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ]
bulk_ac_sc_me3 <- ggplot(m.use,
                         aes(k27ac_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 800)) + ylim(c(0, 800)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ]
bulk_ac_sc_pol <- ggplot(m.use,
                         aes(k27ac_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ]
bulk_pol_sc_me <- ggplot(m.use,
                         aes(pol2_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac"), ]
bulk_ac_sc_ac <- ggplot(m.use,
                        aes(k27ac_cov, ac_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  scale_color_manual(values = colormap) +
  xlim(c(0, 600)) + ylim(c(0, 600)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27ac (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ ac_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3"), ]
bulk_me_sc_me <- ggplot(m.use,
                        aes(k27me3_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 300)) + ylim(c(0, 300)) +
  xlab("H3K27me3 (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27me3_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII"), ]
bulk_pol_sc_pol <- ggplot(m.use,
                          aes(pol2_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  scale_color_manual(values = colormap) +
  xlim(c(0, 200)) + ylim(c(0, 200)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# sc-sc
m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ]
sc_ac_sc_me3 <- ggplot(m.use,
                       aes(ac_sc_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single-cell, multiplexed)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ]
sc_ac_sc_pol <- ggplot(m.use,
                       aes(ac_sc_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single-cell, multiplexed)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ]
sc_pol_sc_me <- ggplot(m.use,
                       aes(pol2_sc_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 200)) + ylim(c(0, 200)) +
  xlab("RNAPII (single-cell, multiplexed)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_sc_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

pp <- (bulk_ac_me3 / bulk_ac_pol / bulk_pol_me) | (bulk_ac_sc_me3 / bulk_ac_sc_pol / bulk_pol_sc_me) | (sc_ac_sc_me3 / sc_ac_sc_pol / sc_pol_sc_me) | (bulk_me_sc_me / bulk_ac_sc_ac / bulk_pol_sc_pol)
pp <- pp & NoLegend()
ggsave(filename = "plots/hek_k562/scatterplots.png", plot = pp, height = 10, width = 12)

# ternary plot for single-cell data
if (!requireNamespace(package = "ggtern", quietly = TRUE)) {
  install.packages("ggtern")
}
library(ggtern)

region_counts <- mat[, c("region", "ac_sc_cov", "me_sc_cov", "pol2_sc_cov")]
colnames(region_counts) <- c("region", "H3K27ac", "H3K27me3", "RNAPII")

# randomize plotting order
region_counts <- region_counts[sample(1:nrow(region_counts)), ]

gp <- ggtern(region_counts, aes(x = H3K27ac, y = H3K27me3, z = RNAPII, color = region)) +
  geom_point(size = 0.1, alpha = 0.2) +
  scale_color_manual(values = colors.use) +
  theme_classic()

ggsave(filename = "plots/hek_k562/ternary.png", plot = gp, height = 8, width = 8)

## KNN purity ## 
if (!requireNamespace(package = "RANN", quietly = TRUE)) {
  install.packages("RANN")
}
library(RANN)

knn_purity_emb <- function(embeddings, dims, clusters, k) {
  nn <- nn2(data = embeddings[, dims], k = k + 1)$nn.idx[, 2:k]
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = nn))) {
    nn_purity[i] <- sum(clusters[nn[i, ]] == clusters[i]) / k
  }
  return(nn_purity)
}

knn_purity_graph <- function(graph, clusters) {
  # pre-existing graph, can't change k
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = graph))) {
    nbr <- names(which(graph[i, ] == 1))
    nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ])
  }
  return(nn_purity)
}

# compute wnn for each combination of modalities
obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_all",
  reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_ac_me3",
  reduction.list = list("lsi.k27ac", "lsi.k27me"), 
  dims.list = list(2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_ac_pol2",
  reduction.list = list("lsi.k27ac", "lsi.pol2"), 
  dims.list = list(2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_me3_pol2",
  reduction.list = list("lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10)
)

dims <- 2:10
k <- 50
emb_me3 <- Embeddings(obj[['lsi.k27me']])
emb_ac <- Embeddings(obj[['lsi.k27ac']])
emb_pol <- Embeddings(obj[['lsi.pol2']])

ct <- obj$celltype

kme3 <- data.frame(
  purity = knn_purity_emb(embeddings = emb_me3, dims = dims, clusters = ct, k = k),
  assay = "H3K27me3",
  celltype = ct
)
kac <- data.frame(
  purity = knn_purity_emb(embeddings = emb_ac, dims = dims, clusters = ct, k = k),
  assay = "H3K27ac",
  celltype = ct
)
kpol <- data.frame(
  purity = knn_purity_emb(embeddings = emb_pol, dims = dims, clusters = ct, k = k),
  assay = "RNAPII",
  celltype = ct
)

kme3_ac <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_ac_me3']], clusters = ct),
  assay = "H3K27me3 + H3K27ac",
  celltype = ct
)
kac_pol <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_ac_pol2']], clusters = ct),
  assay = "H3K27ac + RNAPII",
  celltype = ct
)
kpol_me <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_me3_pol2']], clusters = ct),
  assay = "H3K27me3 + RNAPII",
  celltype = ct
)
kpol_me_ac <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_all']], clusters = ct),
  assay = "H3K27me3 + H3K27ac + RNAPII",
  celltype = ct
)

kdf <- rbind(kme3, kac, kpol, kme3_ac, kac_pol, kpol_me, kpol_me_ac)
kdf$assay <- factor(kdf$assay, levels = c("RNAPII", "H3K27ac", "H3K27me3",
                                             "H3K27ac + RNAPII", "H3K27me3 + RNAPII", "H3K27me3 + H3K27ac",
                                             "H3K27me3 + H3K27ac + RNAPII"))

knn_p <- ggplot(kdf, aes(y = assay, x = purity, fill = assay)) +
  ggridges::geom_density_ridges(bandwidth = 0.008) +
  xlab("Fraction of neighbors of same cell type") + ylab("Assay") +
  theme_classic() +
  scale_fill_manual(values = c("#036C9A", "#F98401", "#D3145A", "#7e784e", "#6b407a", "#e64c2e", "#9a5752")) +
  xlim(c(0.8, 1.01)) +
  theme(legend.position = "none", text = element_text(size = 12)) + 
  geom_vline(xintercept = 1)

ggsave(filename = "plots/hek_k562/knn_purity_wnn.png", plot = knn_p, height = 4, width = 7)

R ggplot2 tabix Seurat BSgenome.Hsapiens.UCSC.hg38 GenomicRanges patcHwork Signac Rsamtools GenomeInfoDb ggtern RANN From line 1 of HEK_K562_sc/analysis.R

library(Signac)
library(ggplot2)
library(patchwork)
library(Rsamtools)


QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401")

# multiCUT&Tag
# same
me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz"
ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz"

# mixed
ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz"
me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz"

# peaks
h3k27me3 <- read.table("data/mesc/ENCFF008XKX.bed.gz", sep = "\t") # H3K27me3 ES-Bruce4 mm10
h3k27ac <- read.table("data/mesc/ENCFF360VIS.bed.gz", sep = "\t") # H3K27ac ES-Bruce4 mm10

h3k27me3 <- makeGRangesFromDataFrame(
  df = h3k27me3, seqnames.field = "V1", start.field = "V2", end.field = "V3"
)
h3k27ac <- makeGRangesFromDataFrame(
  df = h3k27ac, seqnames.field = "V1", start.field = "V2", end.field = "V3"
)

h3k27me3 <- keepStandardChromosomes(h3k27me3, pruning.mode = "coarse")
h3k27ac <- keepStandardChromosomes(h3k27ac, pruning.mode = "coarse")

totals_mct <- CountFragments(
  fragments = list(me3_me3, ac_ac, ac_me3, me3_ac)
)

cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"]

me3_me3_filt <- "data/multict/fragments/H3K27me3-H3K27me3_filtered.tsv.gz"
ac_ac_filt <- "data/multict/fragments/H3K27ac-H3K27ac_filtered.tsv.gz"
ac_me3_filt <- "data/multict/fragments/H3K27ac-H3K27me3_filtered.tsv.gz"
me3_ac_filt <- "data/multict/fragments/H3K27me3-H3K27ac_filtered.tsv.gz"

# filter cells
FilterCells(
  fragments = me3_me3,
  cells = cells_keep_mct,
  outfile = me3_me3_filt
)
FilterCells(
  fragments = ac_ac,
  cells = cells_keep_mct,
  outfile = ac_ac_filt
)
FilterCells(
  fragments = ac_me3,
  cells = cells_keep_mct,
  outfile = ac_me3_filt
)
FilterCells(
  fragments = me3_ac,
  cells = cells_keep_mct,
  outfile = me3_ac_filt
)

h3k27me3$peak <- "H3K27me3"
h3k27ac$peak <- "H3K27ac"
regions <- c(h3k27me3, h3k27ac)

###########
# scatter #
###########

# quantify regions
me3_me3 <- QuantifyRegions(me3_me3_filt, regions)
me3_ac <- QuantifyRegions(me3_ac_filt, regions)
ac_ac <- QuantifyRegions(ac_ac_filt, regions)
ac_me3 <- QuantifyRegions(ac_me3_filt, regions)

mat <- cbind(me3_me3, me3_ac, ac_ac, ac_me3)
mat <- as.data.frame(mat)
mat$region <- regions$peak

mat$me3 <- mat$me3_me3 + (mat$me3_ac/2) + (mat$ac_me3/2)
mat$ac <- mat$ac_ac + (mat$me3_ac/2) + (mat$ac_me3/2)

p_me3_ac <- ggplot(mat, aes(x = ac, y = me3, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  theme(legend.position = 'none') +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  ylab("H3K27me3") +
  xlab("H3K27ac") +
  ggtitle(get_cod_expression(lm(me3 ~ ac, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ggsave(filename = "plots/hek_k562/multi_cuttag_scatter.png", plot = p_me3_ac, height = 4, width = 4)

############
### FRiP ###
############

get_total <- function(frags, cells) {
  tot <- CountFragments(fragments = frags@path, cells = cells)
  totals <- tot$frequency_count
  names(totals) <- tot$CB
  return(totals)
}

get_in_peaks <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = obj,
    features = regions,
    cells = obj@cells
  )
  return(colSums(counts))
}


me3_me3_obj <- CreateFragmentObject(me3_me3_filt, cells = cells_keep_mct)
ac_ac_obj <- CreateFragmentObject(ac_ac_filt, cells = cells_keep_mct)
me3_ac_obj <- CreateFragmentObject(me3_ac_filt, cells = cells_keep_mct)
ac_me3_obj <- CreateFragmentObject(ac_me3_filt, cells = cells_keep_mct)

totals_me3_me3 <- get_total(me3_me3_obj, cells = cells_keep_mct)
totals_ac_me3 <- get_total(ac_me3_obj, cells = cells_keep_mct)
totals_me3_ac <- get_total(me3_ac_obj, cells = cells_keep_mct)
totals_ac_ac <- get_total(ac_ac_obj, cells = cells_keep_mct)

in_me3_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27me3)
in_me3_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27me3)
in_me3_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27me3)
in_me3_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27me3)

in_ac_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27ac)
in_ac_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27ac)
in_ac_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27ac)
in_ac_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27ac)

me3_frip <- data.frame(
  cell = names(totals_me3_me3),
  total = totals_me3_me3 + (totals_ac_me3/2) + (totals_me3_ac/2),
  in_peak = in_me3_peak_me3_me3 + (in_me3_peak_me3_ac/2) + (in_me3_peak_ac_me3/2),
  assay = "H3K27me3",
  peak = "H3K27me3"
)
me3_frip$FRiP <- me3_frip$in_peak / me3_frip$total

ac_frip <- data.frame(
  cell = names(totals_ac_ac),
  total = totals_ac_ac + (totals_ac_me3/2) + (totals_me3_ac/2),
  in_peak = in_ac_peak_ac_ac + (in_ac_peak_me3_ac/2) + (in_ac_peak_ac_me3/2),
  assay = "H3K27ac",
  peak = "H3K27ac"
)
ac_frip$FRiP <- ac_frip$in_peak / ac_frip$total

df <- rbind(me3_frip, ac_frip)

p <- ggplot(df, aes(x = assay, y = FRiP, fill = assay)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(values = colormap) +
  xlab("Antibody") +
  ggtitle("multiCUT&Tag FRiP") +
  theme_bw() +
  theme(legend.position = "none")

ggsave(filename = "plots/hek_k562/multi_cuttag_frip.png", plot = p, height = 4, width = 3)

saveRDS(object = df, file = "objects/frip_mct.rds")

R ggplot2 tabix patcHwork Signac Rsamtools From line 1 of HEK_K562_sc/mct_scatter.R

library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(future)
library(ggplot2)
library(patchwork)

args <- commandArgs(trailingOnly = TRUE)
threads <- args[[1]]
outfile <- args[[2]]
fpath <- args[[3]]

plan("multicore", workers = as.integer(threads))
options(future.globals.maxSize = Inf)

# annotation
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"
genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23]

fragfiles <- list.files(
  path = fpath,
  pattern = "*.bed.gz$",
  full.names = TRUE
)
fname <- gsub(pattern = paste0(fpath, "/"),
              replacement = "",
              x = fragfiles)
fname <- gsub(pattern = ".bed.gz",
              replacement = "",
              x = fname)
names(fragfiles) <- fname

total_counts <- CountFragments(fragments = as.list(fragfiles))
cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"]

create_assay <- function(frag, cells, genome.use) {
  frag_obj <- CreateFragmentObject(
    path = frag,
    cells = cells
  )
  counts <- AggregateTiles(
    object = frag_obj,
    cells = cells,
    min_counts = 1,
    binsize = 10000,
    genome = genome.use
  )
  return(counts)
}

all.counts <- lapply(
  X = fragfiles,
  FUN = create_assay,
  cells = cells.keep,
  genome.use = genome.use
)

# combine matrices for same barcode pairs
names(all.counts) <- fname

all.cells <- lapply(all.counts, colnames)
common.cells <- Reduce(intersect, all.cells)
all.counts <- lapply(all.counts, function(x) x[, common.cells])
combined.matrix <- do.call(what = rbind, args = all.counts)

# create seurat object
obj <- CreateSeuratObject(
    counts = combined.matrix,
    min.cells = 10,
    assay = "all"
)

obj <- obj[, obj$nCount_all < 10000 & obj$nCount_all > 100]

for (i in seq_along(all.counts)) {
  mat <- all.counts[[i]][, colnames(obj)]
  assay <- CreateChromatinAssay(
    counts = mat,
    min.features = -1,
    fragments = fragfiles[[i]],
    annotation = annot
  )
  obj[[names(all.counts)[i]]] <- assay
}

obj <- subset(obj, subset = nCount_K27ac > 75 &
                nCount_K27me > 150 &
                nCount_Pol2 > 100)
# process
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 50)
obj <- RunSVD(obj)
obj <- RunUMAP(obj, reduction = 'lsi', dims = 2:10)

# create lsi, umap from each assay
DefaultAssay(obj) <- "K27ac"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.k27ac")
obj <- RunUMAP(obj, reduction = 'lsi.k27ac', reduction.name = "umap.k27ac", dims = 2:10)

DefaultAssay(obj) <- "K27me"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.k27me")
obj <- RunUMAP(obj, reduction = 'lsi.k27me', reduction.name = "umap.k27me", dims = 2:10)

DefaultAssay(obj) <- "Pol2"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.pol2")
obj <- RunUMAP(obj, reduction = 'lsi.pol2', reduction.name = "umap.pol2", dims = 2:10)

# WNN
obj <- FindMultiModalNeighbors(
  object = obj,
  reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10, 2:10)
)

obj <- RunUMAP(
  object = obj,
  nn.name = "weighted.nn",
  reduction.name = "wnn.umap"
)

obj <- FindClusters(
  object = obj,
  graph.name = "wsnn",
  algorithm = 3,
  resolution = 0.05
)

obj <- RenameIdents(obj, list("0" = "K562", "1" = "HEK"))
obj$celltype <- Idents(obj)

# save object
saveRDS(object = obj, file = outfile)

# write cell names for vireo
writeLines(colnames(obj), "data/HEK_K562_sc/cells.txt")

# split fragment files by cell type
hek <- WhichCells(object = obj, idents = "HEK")
k562 <- WhichCells(object = obj, idents = "K562")

filter_chrom <- function(infile, outfile, chrom.keep) {
  # open input and output files
  con <- gzfile(infile, 'rb')
  outf <- file(description = outfile, "w")

  # read line by line, writing to output file
  while (TRUE) {
    line = readLines(con, n = 1)
    if (length(line) == 0) {
      break
    } else if (substring(text = line, first = 1, last = 1) == "#") {
      next # skip header
    } else {
      # check chromosome
      fields <- unlist(x = strsplit(x = line, "\t"))
      if (!(fields[[1]] %in% chrom.keep)) {
        next
      } else {
        writeLines(text = line, con = outf)
      }
    }
  }
  close(con)
  close(outf)
  system(
    command = paste0("bgzip ", outfile),
    wait = TRUE,
    ignore.stderr = FALSE,
    ignore.stdout = FALSE
  )
  system(
    command = paste0("tabix -p bed ", paste0(outfile, '.gz')),
    wait = TRUE,
    ignore.stderr = FALSE,
    ignore.stdout = FALSE
  )
}

chr.use <- names(genome.use)
for (i in seq_along(fragfiles)) {
  outf <- paste0("data/HEK_K562_sc/split/",
                 names(fragfiles)[[i]])
  FilterCells(
    fragments = fragfiles[[i]],
    cells = k562,
    outfile = paste0(outf, "_k562.bed.gz")
  )
  FilterCells(
    fragments = fragfiles[[i]],
    cells = hek,
    outfile = paste0(outf, "_hek.bed.gz")
  )

  # filter chromosomes
  filter_chrom(
    infile = paste0(outf, "_k562.bed.gz"),
    outfile = paste0(outf, "_k562_filter.bed"),
    chrom.keep = chr.use
  )
  filter_chrom(
    infile = paste0(outf, "_hek.bed.gz"),
    outfile = paste0(outf, "_hek_filter.bed"),
    chrom.keep = chr.use
  )
}

R ggplot2 tabix Seurat BSgenome.Hsapiens.UCSC.hg38 patcHwork EnsDb.Hsapiens.v86 Signac future From line 1 of HEK_K562_sc/process.R

library(Signac)
library(Seurat)
library(EnsDb.Hsapiens.v86)

annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

me3 <- list("data/henikoff/GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz",
            "data/henikoff/GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz")
ac <- "data/henikoff/GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz"

# count fragments per cell
frags_me3_1 <- CountFragments(me3[[1]])
frags_me3_2 <- CountFragments(me3[[2]])
frags_ac <- CountFragments(ac)

min_frags <- 500
cells_me3_1 <- frags_me3_1[frags_me3_1$frequency_count > min_frags, "CB"]
cells_me3_2 <- frags_me3_2[frags_me3_2$frequency_count > min_frags, "CB"]
cells_ac <- frags_ac[frags_ac$frequency_count > min_frags, "CB"]

f_me3_1 <- CreateFragmentObject(me3[[1]], cells = cells_me3_1)
f_me3_2 <- CreateFragmentObject(me3[[2]], cells = cells_me3_2)
f_ac <- CreateFragmentObject(ac, cells = cells_ac)

me3_counts_1 <- FeatureMatrix(
  fragments = f_me3_1,
  features = granges(pbmc[['me3']]),
  cells = cells_me3_1
)
me3_counts_2 <- FeatureMatrix(
  fragments = f_me3_2,
  features = granges(pbmc[['me3']]),
  cells = cells_me3_2
)
ac_counts <- FeatureMatrix(
  fragments = f_ac,
  features = granges(pbmc[['ac']]),
  cells = cells_ac
)

# change cell barcodes to add batch info
colnames(me3_counts_1) <- paste0(colnames(me3_counts_1), ":1")
colnames(me3_counts_2) <- paste0(colnames(me3_counts_2), ":2")
names(f_me3_1@cells) <- paste0(names(f_me3_1@cells), ":1")
names(f_me3_2@cells) <- paste0(names(f_me3_2@cells), ":2")

me3 <- CreateSeuratObject(
  counts = CreateChromatinAssay(
    counts = cbind(me3_counts_1, me3_counts_2),
    fragments = list(f_me3_1, f_me3_2),
    annotation = annot
  ),
  names.field = 2,
  names.delim = ":",
  assay = "me3",
  meta.data = md
)

me3 <- FindTopFeatures(me3)
me3 <- RunTFIDF(me3)
me3 <- RunSVD(me3, scale.embeddings = TRUE, reduction.name = 'lsi.me3')
me3 <- RunUMAP(me3, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3')

saveRDS(object = me3, file = "objects/henikoff/me3_filt.rds")

ac <- CreateSeuratObject(
  counts = CreateChromatinAssay(
    counts = ac_counts,
    fragments = ac,
    annotation = annot
  ),
  assay = "ac",
  meta.data = md_ac
)

ac <- FindTopFeatures(ac)
ac <- RunTFIDF(ac)
ac <- RunSVD(ac, scale.embeddings = TRUE, reduction.name = 'lsi.ac')
ac <- RunUMAP(ac, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac')

saveRDS(object = ac, file = "objects/henikoff/ac_filt.rds")

R Seurat EnsDb.Hsapiens.v86 Signac From line 1 of henikoff/process.R

library(Signac)
library(ggplot2)

bigWigs <- list(
  "H3K27me3 (multi)" = "data/pbmc_bulk/mapped/plex/me3.bw",
  "H3K27ac (multi)" = "data/pbmc_bulk/mapped/plex/ac.bw",
  "H3K27me3 (mono)" = "data/pbmc_bulk/mapped/mono/me3.bw",
  "H3K27ac (mono)" = "data/pbmc_bulk/mapped/mono/ac.bw"
)

bw <- BigwigTrack(
  region = "chr1-156630171-156920171",
  bigwig = bigWigs,
  y_label = "Normalized coverage",
  ymax = "q90",
  bigwig.scale = "common",
  smooth = 1000
) + theme(legend.position = "none")

bw <- bw + scale_fill_manual(values = c("#D3145A", "#F98401",
                                        "#676767", "#676767"))

ggsave("plots/pbmc_bulk/bulk_covplot_pbmc.png", plot = bw, height = 5, width = 10, units = "in")

R ggplot2 Signac From line 1 of pbmc_bulk/analysis.R

library(ggplot2)
library(GenomicRanges)
library(Rsamtools)

getFrip <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  in_peak <- countTabix(tabix.file, param = regions)
  total_in_peak <- sum(unlist(in_peak))

  # close connection
  close(con = tabix.file)

  total <- system(
    command = paste0("gzip -dc ", fragfile, " | wc -l"),
    wait = TRUE,
    intern = TRUE
  )
  total <- as.numeric(total)

  frip <- total_in_peak / total
  return(frip)
}


# bulk fragments
me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"
me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz"
ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz"

regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)

regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)

frip_me3_mono_me3_peak <- getFrip(fragfile = me3_mono, regions = regions_me3)
frip_me3_plex_me3_peak <- getFrip(fragfile = me3_plex, regions = regions_me3)
frip_ac_mono_me3_peak <- getFrip(fragfile = ac_mono, regions = regions_me3)
frip_ac_plex_me3_peak <- getFrip(fragfile = ac_plex, regions = regions_me3)

frip_me3_mono_ac_peak <- getFrip(fragfile = me3_mono, regions = regions_ac)
frip_me3_plex_ac_peak <- getFrip(fragfile = me3_plex, regions = regions_ac)
frip_ac_mono_ac_peak <- getFrip(fragfile = ac_mono, regions = regions_ac)
frip_ac_plex_ac_peak <- getFrip(fragfile = ac_plex, regions = regions_ac)

me3_peak <- data.frame(
  frip = c(frip_me3_mono_me3_peak, frip_me3_plex_me3_peak, frip_ac_mono_me3_peak, frip_ac_plex_me3_peak),
  assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex")
)
me3_peak$assay <- factor(me3_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono"))

p1 <- ggplot(me3_peak, aes(assay, frip, fill = assay)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("FRiP") +
  ggtitle("Fraction of reads in H3K27me3 peaks")

ac_peak <- data.frame(
  frip = c(frip_me3_mono_ac_peak, frip_me3_plex_ac_peak, frip_ac_mono_ac_peak, frip_ac_plex_ac_peak),
  assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex")
)
ac_peak$assay <- factor(ac_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono"))

p2 <- ggplot(ac_peak, aes(assay, frip, fill = assay)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("FRiP") +
  ggtitle("Fraction of reads in H3K27ac peaks")

ggsave(filename = "plots/pbmc_bulk/frip_me3.png", plot = p1, height = 4, width = 4)
ggsave(filename = "plots/pbmc_bulk/frip_ac.png", plot = p2, height = 4, width = 4)

R ggplot2 tabix GenomicRanges Rsamtools From line 1 of pbmc_bulk/frip.R

library(ggplot2)
library(patchwork)
library(GenomicRanges)
library(Rsamtools)


me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"
me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz"
ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz"

regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

regions <- c(regions_me3, regions_ac)

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

ac_plex_cov <- QuantifyRegions(ac_plex, regions)
ac_mono_cov <- QuantifyRegions(ac_mono, regions)
me3_plex_cov <- QuantifyRegions(me3_plex, regions)
me3_mono_cov <- QuantifyRegions(me3_mono, regions)

mat <- cbind(ac_plex_cov, ac_mono_cov, me3_plex_cov, me3_mono_cov)

mat <- as.data.frame(mat)
mat$region <- regions$peak
colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

mat <- mat[sample(nrow(mat)), ]

# p-m
m.use <- mat[mat$region == "H3K27ac", ]
ac_plex_mono <- ggplot(
  data = m.use,
  mapping = aes(ac_plex_cov, ac_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 800)) + ylim(c(0, 800)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27ac (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ ac_mono_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region == "H3K27me3", ]
me3_plex_mono <- ggplot(
  data = m.use,
  mapping = aes(me3_plex_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 100)) + ylim(c(0, 100)) +
  xlab("H3K27me3 (multiple antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(me3_plex_cov ~ me3_mono_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ac_plex_me3_mono <- ggplot(
  data = mat,
  mapping = aes(ac_plex_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_mono_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ac_mono_me3_plex <- ggplot(
  data = mat,
  mapping = aes(ac_mono_cov, me3_plex_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single antibody)") +
  ylab("H3K27me3 (multiple antibody)") +
  ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_plex_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# m-m
ac_mono_me3_mono <- ggplot(
  data = mat,
  mapping = aes(ac_mono_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_mono_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# p-p
ac_plex_me3_plex <- ggplot(
  data = mat,
  mapping = aes(ac_plex_cov, me3_plex_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27me3 (multiple antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_plex_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

pp <- (ac_mono_me3_mono | ac_mono_me3_plex | ac_plex_me3_mono | ac_plex_me3_plex | ac_plex_mono | me3_plex_mono) & theme(legend.position = "none")
ggsave(filename = "plots/pbmc/bulk_scatter.png", plot = pp, height = 3.5, width = 16, dpi = 600)

R ggplot2 tabix GenomicRanges patcHwork Rsamtools From line 1 of pbmc_bulk/quantify_regions.R

library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(dplyr)


args <- commandArgs(trailingOnly = TRUE)
ntt <- readRDS(args[[1]])
ct_ac <- readRDS(args[[2]])
ct_me <- readRDS(args[[3]])
pbmc_sc <- readRDS(args[[4]])

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

# QC plots
me3_frag <- Fragments(ntt[['me3']])[[1]]@path
ac_frag <- Fragments(ntt[['ac']])[[1]]@path

tot_me3 <- CountFragments(
  fragments = me3_frag,
  cells = colnames(ntt)
)
tot_me3$mark <- "H3K27me3"
tot_ac <- CountFragments(
  fragments = ac_frag,
  cells = colnames(ntt)
)
tot_ac$mark <- "H3K27ac"

all_counts <- rbind(tot_me3, tot_ac)

mean(tot_me3$frequency_count) # 2854
mean(tot_ac$frequency_count) # 412
sd(tot_me3$frequency_count) # 2953
sd(tot_ac$frequency_count) # 356

# pbmc replicate
me3_frag_2 <- Fragments(pbmc_sc[['me3']])[[1]]@path
ac_frag_2 <- Fragments(pbmc_sc[['ac']])[[1]]@path

tot_me3_2 <- CountFragments(
  fragments = me3_frag_2,
  cells = colnames(pbmc_sc)
)
tot_me3_2$mark <- "H3K27me3"
tot_ac_2 <- CountFragments(
  fragments = ac_frag_2,
  cells = colnames(pbmc_sc)
)
tot_ac_2$mark <- "H3K27ac"

all_counts_2 <- rbind(tot_me3_2, tot_ac_2)

mean(tot_me3_2$frequency_count) # 670
mean(tot_ac_2$frequency_count) # 731
sd(tot_me3_2$frequency_count) # 1243
sd(tot_ac_2$frequency_count) # 1035

all_counts$dataset <- "scNTT-seq"
all_counts_2$dataset <- "scNTT-seq"

# ct henikoff
me3_frag_ct <- Fragments(ct_me[['tiles']])[[1]]@path
ac_frag_ct <- Fragments(ct_ac[["tiles"]])[[1]]@path

tot_me3_ct <- CountFragments(
  fragments = me3_frag_ct,
  cells = colnames(ct_me)
)
tot_me3_ct$mark <- "H3K27me3"
tot_ac_ct <- CountFragments(
  fragments = ac_frag_ct,
  cells = colnames(ct_ac)
)
tot_ac_ct$mark <- "H3K27ac"

all_counts_ct <- rbind(tot_me3_ct, tot_ac_ct)

mean(tot_me3_ct$frequency_count) # 519
mean(tot_ac_ct$frequency_count) # 283
sd(tot_me3_ct$frequency_count) # 383
sd(tot_ac_ct$frequency_count) # 272

all_counts_ct$dataset <- "scCUT&Tag"

all_counts <- rbind(all_counts, all_counts_ct)

p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments per cell")

p01 <- ggplot(all_counts, aes(mark, reads_count, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Reads") +
  xlab("Mark") +
  ggtitle("Total reads per cell") +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/fragments.png", plot = p01 | p0, height = 4, width = 7)

# replicate 2
p0 <- ggplot(all_counts_2, aes(mark, frequency_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  scale_fill_manual(values = colormap[1:2]) +
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments per cell") +
  theme(legend.position = "none")

p01 <- ggplot(all_counts_2, aes(mark, reads_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  scale_fill_manual(values = colormap[1:2]) +
  ylab("Reads") +
  xlab("Mark") +
  ggtitle("Total reads per cell") +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/fragments_rep2.png", plot = p01 | p0, height = 4, width = 7)


t1 <- TSSPlot(ntt, assay = "me3", group.by = "orig.ident") +
  ggtitle("H3K27me3") + scale_color_manual(values = colormap$H3K27me3)
t2 <- TSSPlot(ntt, assay = "ac", group.by = "orig.ident") +
  ggtitle("H3K27ac") + scale_color_manual(values = colormap$H3K27ac)
tt <- (t1 | t2) & ylim(c(0, 7))
ggsave(filename = "plots/pbmc/tss.png", plot = tt, height = 8, width = 5)

p1 <- DimPlot(ntt, reduction = "umap.me3", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27me3") + NoLegend()
p2 <- DimPlot(ntt, reduction = "umap.ac", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27ac") + NoLegend()
p3 <- DimPlot(ntt, reduction = "umap.adt", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("ADT") + NoLegend()
p4 <- DimPlot(ntt, reduction = "umap.wnn", label = TRUE, label.size = 4) + ggtitle("PBMCs: H3K27me3 + H3K27ac + protein") + theme_classic() + NoLegend() + ylab("UMAP 1") + xlab("UMAP 2") 

pp <- ((p1 / p2 / p3) | p4) + plot_layout(widths = c(1, 2))
ggsave(filename = "plots/pbmc/dimplots.png", plot = pp, height = 5, width = 7)

# wnn weight
v1 <- VlnPlot(ntt, c("ac.weight", "me3.weight", "ADT.weight"), ncol = 1, pt.size = 0)
ggsave(filename = "plots/pbmc/wnn_weight.png", plot = v1, height = 8, width = 12)

# adt sensitivity
df <- ntt[[]]
df$dataset <- "scNTT-seq"
df <- df[, c("nCount_ADT", "dataset")]
mean(df$nCount_ADT) # 690
sd(df$nCount_ADT)   # 613

df2 <- ct_me[[]]
df2$dataset <- "CUT&Tag-pro (H3K27me3)"
df2 <- df2[, c("nCount_ADT", "dataset")]

df3 <- ct_ac[[]]
df3$dataset <- "CUT&Tag-pro (H3K27ac)"
df3 <- df3[, c("nCount_ADT", "dataset")]

df <- rbind(df, df2, df3)

v2 <- ggplot(data = df, aes(dataset, nCount_ADT, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() +
  theme(axis.title.x=element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        legend.position = 'none') +
  ggtitle("ADT counts per cell")

ggsave(filename = "plots/pbmc/ncount_adt.png", plot = v2, height = 4, width = 2.5)

# featureplots adt
DefaultAssay(ntt) <- "ADT"
fp <- FeaturePlot(
  ntt,
  c("CD14", "CD19", "CD4", "CD8A", "IL2RB", "TFRC"),
  pt.size = 0.1,
  max.cutoff = 'q99',
  cols = c("lightgrey", "darkgreen"),
  ncol = 3,
  reduction = 'umap.wnn'
) & xlab("UMAP 1") & ylab("UMAP 2") & NoLegend()

ggsave(filename = "plots/pbmc/featureplots.png", plot = fp, height = 6, width = 8)

DefaultAssay(ntt) <- "me3"
cp <- CoveragePlot(
  object = ntt,
  region = "PAX5",
  assay = list("me3", "ac"),
  split.assays = FALSE,
  assay.scale = "separate",
  features = c("CD19"),
  expression.assay = "ADT",
  idents = c("CD14+ Mono", "B cell"),
  extend.upstream = -30000,
  extend.downstream = 10000,
  window = 500,
  peaks = FALSE
)
cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac))
ggsave(filename = "plots/pbmc/pax5_covplot.png", plot = cp, height = 3, width = 8)

# colorscale
cols <- colorRampPalette(c("lightgrey", "darkgreen"))(100)

png("plots/pbmc/colorbar.png", width = 10, height = 4, units = "in", res = 400)
image(matrix(1:100), col=cols)
dev.off()

cp <- CoveragePlot(
  object = ntt,
  region = "CD33",
  assay = list("me3", "ac"),
  split.assays = FALSE,
  assay.scale = "separate",
  features = c("CD33"),
  expression.assay = "ADT",
  idents = c("CD14+ Mono", "B cell"),
  extend.upstream = 2000,
  extend.downstream = 2000,
  window = 500,
  peaks = FALSE
)
cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac))
ggsave(filename = "plots/pbmc/cd33_covplot.png", plot = cp, height = 3, width = 8)

# run WNN on pair of chromatin assays, quantify how well structure matches protein neighbor graph

ntt <- FindMultiModalNeighbors(
  object = ntt,
  reduction.list = list("lsi.me3", "lsi.ac"),
  dims.list = list(2:30, 2:30),
  knn.graph.name = "wknn.chrom",
  weighted.nn.name = "wnn.chrom",
  modality.weight.name = "wnn.chrom"
)

# compute KNN purity using ADT-defined cell types, compare different graphs
knn_purity_graph <- function(graph, clusters) {
  # pre-existing graph, can't change k
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = graph))) {
    nbr <- names(which(graph[i, ] == 1))
    nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ])
  }
  return(nn_purity)
}

clusters <- ntt$celltype

ntt <- FindNeighbors(ntt, reduction = "lsi.me3", dims = 2:30, k.param = 100)
ntt <- FindNeighbors(ntt, reduction = "lsi.ac", dims = 2:30, k.param = 100)
ntt <- FindNeighbors(ntt, reduction = "pca", dims = 1:30, k.param = 100)

graph.me3 <- ntt[['me3_nn']]
graph.ac <- ntt[['ac_nn']]
graph.chrom <- ntt[['wknn.chrom']]
graph.adt <- ntt[['ADT_nn']]
graph.wnn <- ntt[['wknn']]

kme3 <- data.frame(
  purity = knn_purity_graph(graph = graph.me3, clusters = clusters),
  assay = "H3K27me3",
  celltype = clusters
)
kac <- data.frame(
  purity = knn_purity_graph(graph = graph.ac, clusters = clusters),
  assay = "H3K27ac",
  celltype = clusters
)
kchrom <- data.frame(
  purity = knn_purity_graph(graph = graph.chrom, clusters = clusters),
  assay = "H3K27me3 + H3K27ac",
  celltype = clusters
)
kadt <- data.frame(
  purity = knn_purity_graph(graph = graph.adt, clusters = clusters),
  assay = "ADT",
  celltype = clusters
)
kwnn <- data.frame(
  purity = knn_purity_graph(graph = graph.wnn, clusters = clusters),
  assay = "ADT + H3K27me3 + H3K27ac",
  celltype = clusters
)

kdf <- rbind(kme3, kac, kchrom, kadt)
kdf$assay <- factor(kdf$assay, levels = c("H3K27ac", "H3K27me3", "H3K27me3 + H3K27ac", "ADT"))

df <- kdf %>% 
  group_by(assay) %>% 
  mutate(total_025 = sum(purity < 0.25) / n()) %>% 
  select(assay, total_025) %>% 
  unique()

knn_p <- ggplot(df, aes(assay, total_025, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(legend.position = "none") +
  xlab("Assay") + ylab("Fraction of cells with \n<25% neighbors same celltype") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_fill_manual(
    values = c(colormap$H3K27ac, colormap$H3K27me3, mixed$ma, "darkgreen")
  )

ggsave(filename = "plots/pbmc/knn_purity.pdf", plot = knn_p, height = 4, width = 3)

R ggplot2 dplyr Seurat patcHwork Signac From line 1 of pbmc_protein/analysis.R

library(ggplot2)
library(patchwork)

mat_ac <- read.table("data/pbmc_protein/bigwig/raw_ac.tsv", sep = "\t", header = TRUE, comment.char = "")
mat_ac <- mat_ac[, 4:ncol(mat_ac)] # remove region
mat_ac <- as.matrix(mat_ac)
mat_ac[is.nan(mat_ac)] <- 0
mat_ac <- as.data.frame(mat_ac)

mat_me3 <- read.table("data/pbmc_protein/bigwig/raw_me3.tsv", sep = "\t", header = TRUE, comment.char = "")
mat_me3 <- mat_me3[, 4:ncol(mat_me3)] # remove region
mat_me3 <- as.matrix(mat_me3)
mat_me3[is.nan(mat_me3)] <- 0
mat_me3 <- as.data.frame(mat_me3)

encode <- list("ENCFF569IPX.bigWig" = "NK_H3K27me3_chip",
               "ENCFF293ETP.bigWig" = "NK_H3K27ac_chip",
               "ENCFF842JLZ.bigWig" = "CD8_memory_H3K27me3_chip",
               "ENCFF611GRL.bigWig" = "CD8_memory_H3K27ac_chip",
               "ENCFF181OXO.bigWig" = "CD4_H3K27ac_chip",
               "ENCFF811VAX.bigWig" = "CD4_H3K27me3_chip",
               "ENCFF526VJO.bigWig" = "CD8_H3K27ac_chip",
               "ENCFF499VWN.bigWig" = "CD8_H3K27me3_chip",
               "ENCFF777EGG.bigWig" = "CD14_mono_H3K27me3_chip",
               "ENCFF601NLG.bigWig" = "CD14_mono_H3K27ac_chip",
               "ENCFF951ZBV.bigWig" = "B_cell_H3K27ac_chip",
               "ENCFF046VLL.bigWig" = "B_cell_H3K27me3_chip",
               "ENCFF085YMB.bigWig" = "CMP_CD34_H3K27me3_chip",
               "ENCFF064JOI.bigWig" = "CMP_CD34_H3K27ac_chip")

process_matrix <- function(mat, encode, method = "pearson") {
    encode_mat <- colnames(mat) %in% names(encode)
    colnames(mat)[encode_mat] <- unname(encode[colnames(mat)[encode_mat]])
    colnames(mat) <- gsub(".bw", "", colnames(mat), fixed = TRUE)

    # work out me3 or ac
    if (any(grepl("me3", colnames(mat)))) {
        mat <- mat[, c("B_cell_me3", "B_cell_H3K27me3_chip",
               "NK_me3", "NK_H3K27me3_chip",
               "CD14_Mono_me3", "CD14_mono_H3K27me3_chip",
               "CD8_T_cell_me3", "CD8_H3K27me3_chip",
               "CD4_T_cell_me3", "CD4_H3K27me3_chip",
               "Late_erythroid_me3", "CMP_CD34_H3K27me3_chip")]
    } else {
        mat <- mat[, c("B_cell_ac", "B_cell_H3K27ac_chip",
               "NK_ac", "NK_H3K27ac_chip",
               "CD14_Mono_ac", "CD14_mono_H3K27ac_chip",
               "CD8_T_cell_ac", "CD8_H3K27ac_chip",
               "CD4_T_cell_ac", "CD4_H3K27ac_chip",
               "Late_erythroid_ac", "CMP_CD34_H3K27ac_chip")]
    }
    all_cor <- cor(mat, method = method)
    ac <- reshape2::melt(all_cor)
    ac <- ac[ac$Var1 %in% encode, ]
    ac <- ac[ac$Var2 %in% setdiff(colnames(mat), encode), ]
    ac$`Pearson correlation` <- ac$value
    return(ac)
}

me3 <- process_matrix(mat=mat_me3, encode=encode, method = "spearman")
ac <- process_matrix(mat=mat_ac, encode=encode, method = "spearman")

cells_keep <- c("CD14_Mono_me3", "Late_erythroid_me3", "B_cell_me3")
encode_keep <- c("B_cell_H3K27me3_chip", "CD14_mono_H3K27me3_chip", "CMP_CD34_H3K27me3_chip")

me3 <- me3[me3$Var2 %in% cells_keep, ]
me3 <- me3[me3$Var1 %in% encode_keep, ]

cells_keep <- c("CD14_Mono_ac", "Late_erythroid_ac", "B_cell_ac")
encode_keep <- c("B_cell_H3K27ac_chip", "CD14_mono_H3K27ac_chip", "CMP_CD34_H3K27ac_chip")

ac <- ac[ac$Var2 %in% cells_keep, ]
ac <- ac[ac$Var1 %in% encode_keep, ]

replacement_encode <- list("B_cell_H3K27ac_chip" = "B cell",
                           "CD14_mono_H3K27ac_chip" = "CD14 Monocyte",
                           "CMP_CD34_H3K27ac_chip" = "CD34+ CMP")

replacement_sc <- list("B_cell_ac" = "B cell",
                       "CD14_Mono_ac" = "CD14 Monocyte",
                       "Late_erythroid_ac" = "Late_erythroid")

ac$Var1 <- as.character(replacement_encode[as.character(ac$Var1)])
ac$Var2 <- as.character(replacement_sc[as.character(ac$Var2)])

replacement_encode <- list("B_cell_H3K27me3_chip" = "B cell",
                           "CD14_mono_H3K27me3_chip" = "CD14 Monocyte",
                           "CMP_CD34_H3K27me3_chip" = "CD34+ CMP")

replacement_sc <- list("B_cell_me3" = "B cell",
                       "CD14_Mono_me3" = "CD14 Monocyte",
                       "Late_erythroid_me3" = "Late_erythroid")

me3$Var1 <- as.character(replacement_encode[as.character(me3$Var1)])
me3$Var2 <- as.character(replacement_sc[as.character(me3$Var2)])

p1 <- ggplot(me3, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdYlBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("Bulk-cell ChIP-seq") +
  ylab("Single-cell multiplexed NTT-seq") +
  ggtitle("H3K27me3") +
  theme(legend.position = "none")

p2 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdYlBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("Bulk-cell ChIP-seq") +
  ylab("Single-cell multiplexed NTT-seq") +
  ggtitle("H3K27ac")

pp <- (p1 / p2)

ggsave(filename = "plots/pbmc/encode_cor_spearman.png", plot = pp, height = 6, width = 5)

# correlation between replicates
mat <- read.table("data/pbmc_protein/replicate_cor.tsv", sep = "\t", header = TRUE, comment.char = "")
mat <- mat[, 4:ncol(mat)] # remove region
mat <- as.matrix(mat)
mat[is.nan(mat)] <- 0
mat <- as.data.frame(mat)

colnames(mat) <- c(
  "Rep. 1 H3K27me3",
  "Rep. 1 H3K27ac",
  "Rep. 2 H3K27me3",
  "Rep. 2 H3K27ac"
)

mat <- mat[, c("Rep. 1 H3K27me3", "Rep. 2 H3K27me3", "Rep. 1 H3K27ac", "Rep. 2 H3K27ac")]

all_cor <- cor(mat, method = "pearson")
ac <- reshape2::melt(all_cor)
ac$`Pearson correlation` <- ac$value

p1 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("") +
  ylab("")

ggsave("plots/pbmc/replicate_correlation.png", plot = p1, height = 3, width = 5)

R ggplot2 patcHwork From line 1 of pbmc_protein/encode_cor.R

library(Signac)
library(Seurat)
library(GenomicRanges)
library(ggplot2)

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401")

pbmc <- readRDS("objects/pbmc_protein.rds")
pbmc_sc <- readRDS("objects/pbmc_sc.rds")
henikoff_me3_filt <- readRDS("objects/henikoff/me3_filt.rds")
henikoff_ac_filt <- readRDS("objects/henikoff/ac_filt.rds")

# ENCODE peaks
pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz")
pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz")

pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3")
pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3")

# total fragments for each dataset

get_total <- function(frags, cells) {
  tot <- CountFragments(fragments = frags, cells = cells)
  totals <- tot$frequency_count
  names(totals) <- tot$CB
  return(totals)
}

# pbmc
totals_pbmc_ac <- get_total(Fragments(pbmc[['ac']])[[1]]@path, colnames(pbmc))
totals_pbmc_me3 <- get_total(Fragments(pbmc[['me3']])[[1]]@path, cells = colnames(pbmc))

# new replicate
totals_pbmc_ac_n <- get_total(Fragments(pbmc_sc[['ac']])[[1]]@path, cells = colnames(pbmc_sc))
totals_pbmc_me3_n <- get_total(Fragments(pbmc_sc[['me3']])[[1]]@path, cells = colnames(pbmc_sc))

# henikoff
r1_cells_filt <- henikoff_me3_filt$orig.ident == '1'
r2_cells_filt <- henikoff_me3_filt$orig.ident == '2'
totals_h_me3_1_f <- get_total(Fragments(henikoff_me3_filt)[[1]]@path, cells = henikoff_me3_filt$barcode[r1_cells_filt])
totals_h_me3_2_f <- get_total(Fragments(henikoff_me3_filt)[[2]]@path, cells = henikoff_me3_filt$barcode[r2_cells_filt])
names(totals_h_me3_1_f) <- paste0(names(totals_h_me3_1_f), ":1")
names(totals_h_me3_2_f) <- paste0(names(totals_h_me3_2_f), ":2")
totals_h_me3_f <- c(totals_h_me3_1_f, totals_h_me3_2_f)

totals_h_ac_f <- get_total(Fragments(henikoff_ac_filt)[[1]]@path, cells = colnames(henikoff_ac_filt))

# total counts in peak regions
get_in_peaks <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = Fragments(obj),
    features = regions,
    cells = colnames(obj)
  )
  return(colSums(counts))
}

# ntt
in_peak_pbmc_ac_ac <- get_in_peaks(obj = pbmc[['ac']], regions = pk_ac)
in_peak_pbmc_ac_me <- get_in_peaks(obj = pbmc[['ac']], regions = pk_me3)
in_peak_pbmc_me_me <- get_in_peaks(obj = pbmc[['me3']], regions = pk_me3)
in_peak_pbmc_me_ac <- get_in_peaks(obj = pbmc[['me3']], regions = pk_ac)

# rep2
in_peak_pbmc_ac_ac_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_ac)
in_peak_pbmc_ac_me_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_me3)
in_peak_pbmc_me_me_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_me3)
in_peak_pbmc_me_ac_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_ac)

# henikoff filt
in_peak_h_ac_ac_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_ac)
in_peak_h_ac_me_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_me3)
in_peak_h_me_me_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_me3)
in_peak_h_me_ac_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_ac)

# frip
make_df <- function(totals, inpeak, assay, pk, dataset) {
  d <- data.frame(
    cell = names(totals),
    total = totals,
    in_peak = inpeak[names(totals)],
    assay = assay,
    peak = pk,
    dataset = dataset
  )
  d$FRIP <- d$in_peak / d$total
  return(d)
}

df <- rbind(
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq"),
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq"),

  make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag"),
  make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag"),
  make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag"),
  make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag")
)

df2 <- rbind(
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq"),
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq"),
)

df_filt <- df[df$assay == df$peak, ]
df_filt <- df_filt[!is.na(df_filt$FRIP), ]

p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) +
  geom_boxplot(outlier.size = 0.5) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("FRiP") +
  theme(legend.position = 'none')

ggsave("plots/pbmc/frip_pbmc.png", plot = p, height = 4, width = 2.5)


p2 <- ggplot(df2, aes(x = assay, y = FRIP, fill = assay)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(values = colormap) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("FRiP") +
  theme(legend.position = 'none')
ggsave("plots/pbmc/frip_pbmc_rep2.png", plot = p2, height = 4, width = 2.5)

saveRDS(object = df_filt, file = "data/pbmc_protein/frip.rds")

df1 <- df_filt[df_filt$dataset == "scNTT-seq", ]

mean(df1[df1$peak == "H3K27ac", "FRIP"])  # 0.21
sd(df1[df1$peak == "H3K27ac", "FRIP"])  # 0.09
mean(df1[df1$peak == "H3K27me3", "FRIP"])  # 0.11
sd(df1[df1$peak == "H3K27me3", "FRIP"])  # 0.05

mean(df2[df2$peak == "H3K27ac", "FRIP"])  # 0.28
sd(df2[df2$peak == "H3K27ac", "FRIP"])  # 0.08
mean(df2[df2$peak == "H3K27me3", "FRIP"])  # 0.10
sd(df2[df2$peak == "H3K27me3", "FRIP"])  # 0.06


# revision
df3 <- rbind(
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 1)"),

  make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag (PBMC)"),
  make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag (PBMC)"),
  make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag (PBMC)"),
  make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag (PBMC)"),

  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 2)")
)


bmmc_frip <- readRDS(file = "data/bmmc_dual/frip.rds")
bmmc_frip$dataset <- "scNTT-seq (BMMC)"

df3 <- rbind(bmmc_frip, df3)
df_filt <- df3[df3$assay == df3$peak, ]
df_filt <- df_filt[!is.na(df_filt$FRIP), ]

p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) +
  geom_boxplot(outlier.size = 0.5) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("Fragments in peaks")

ggsave("plots/pbmc/frip_pbmc_revision.png", plot = p, height = 4, width = 6)

R ggplot2 Seurat GenomicRanges Signac From line 1 of pbmc_protein/frip.R

library(Seurat)
library(Signac)
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(tximport)
library(future)


args <- commandArgs(trailingOnly = TRUE)
threads <- as.numeric(args[[1]])
adt <- args[[2]]
ac_frag <- args[[3]]
me3_frag <- args[[4]]
outfile <- args[[5]]

plan("multicore", workers = as.numeric(threads))
options(future.globals.maxSize = Inf)

# process chromatin data
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

total_me3 <- CountFragments(fragments = me3_frag)
total_ac <- CountFragments(fragments = ac_frag)

total_ac <- total_ac[total_ac$frequency_count > 100, ]
total_me3 <- total_me3[total_me3$frequency_count > 100, ]
cells_keep <- intersect(total_ac$CB, total_me3$CB)

frag_obj_me3 <- CreateFragmentObject(path = me3_frag, cells = cells_keep)
frag_obj_ac <- CreateFragmentObject(path = ac_frag, cells = cells_keep)

chr_use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22]
counts_me3 <- AggregateTiles(
  object = frag_obj_me3,
  genome = chr_use,
  cells = cells_keep,
  min_counts = 1
)

counts_ac <- AggregateTiles(
  object = frag_obj_ac,
  genome = chr_use,
  cells = cells_keep,
  min_counts = 1
)

# remove features overlapping blacklist
remove_blacklist <- function(counts) {
  olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified)
  is_bl <- queryHits(olap)
  counts <- counts[setdiff(1:nrow(counts), is_bl), ]
  return(counts)
}

assay_me3 <- CreateChromatinAssay(
  counts = remove_blacklist(counts_me3),
  fragments = me3_frag,
  annotation = annot
)
assay_ac <- CreateChromatinAssay(
  counts = remove_blacklist(counts_ac),
  fragments = ac_frag,
  annotation = annot
)
obj <- CreateSeuratObject(counts = assay_me3, assay = "me3")
obj[['ac']] <- assay_ac

# add adt data
adt.txi <- tximport(files = adt, type = "alevin")
cells <- intersect(colnames(obj), colnames(adt.txi$counts))
obj <- obj[, cells]
obj[['ADT']] <- CreateAssayObject(counts = adt.txi$counts[, cells])

obj <- subset(obj, subset = nCount_me3 < 40000 &
                nCount_ac < 10000 &
                nCount_me3 > 300 &
                nCount_ac > 100 &
                nCount_ADT < 10000 &
                nCount_ADT > 120)

DefaultAssay(obj) <- "ADT"
obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2)
igg_features <- rownames(obj)[grepl("^Ig", x = rownames(obj))]
var_feat <- setdiff(rownames(obj), igg_features)
VariableFeatures(obj) <- var_feat
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40)
obj <- FindClusters(obj)
obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40)

# remove artifact clusters
obj <- subset(obj, idents = c(4, 8), invert = TRUE)

obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2)
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40)
obj <- FindClusters(obj)
obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40)

DefaultAssay(obj) <- "me3"
feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features
obj[['me3']] <- subset(obj[['me3']], features = feat.keep)
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.me3')
obj <- RunUMAP(obj, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3')

DefaultAssay(obj) <- "ac"
feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features
obj[['ac']] <- subset(obj[['ac']], features = feat.keep)
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.ac')
obj <- RunUMAP(obj, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac')

obj <- FindMultiModalNeighbors(
  object = obj,
  reduction.list = list("lsi.me3", "lsi.ac", "pca"),
  dims.list = list(2:30, 2:30, 1:30)
)
obj <- RunUMAP(obj, nn.name = "weighted.nn", reduction.name = "umap.wnn")
obj <- FindClusters(obj, graph.name = "wsnn", algorithm = 3, resolution = 1)

Idents(obj) <- "seurat_clusters"

obj <- RenameIdents(obj, list("0" = "CD14+ Mono",
                              "1" = "CD14+ Mono",
                              "2" = "B cell",
                              "3" = "CD14+ Mono",
                              "4" = "CD4 T cell",
                              "5" = "CD16+ Mono",
                              "6" = "Late erythroid",
                              "7" = "CD8 T cell",
                              "8" = "B cell",
                              "9" = "NK",
                              "10" = "cDC"
                             )
)
obj$celltype <- Idents(obj)

obj <- TSSEnrichment(obj, fast = FALSE, assay = "me3")
obj <- TSSEnrichment(obj, fast = FALSE, assay = "ac")

saveRDS(object = obj, file = outfile)

R Seurat BSgenome.Hsapiens.UCSC.hg38 EnsDb.Hsapiens.v86 tximport Signac future From line 1 of pbmc_protein/process.R

library(Seurat)
library(Signac)
library(Rsamtools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggplot2)
library(patchwork)


args <- commandArgs(trailingOnly = TRUE)
ac_peaks <- args[[1]]
me3_peaks <- args[[2]]

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

regions_ac <- read.table(ac_peaks, sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_me3 <- read.table(me3_peaks, sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

# regions_me3 <- read.table("data/pbmc_bulk/mapped/me3_peaks.broadPeak", sep = "\t",
#                           col.names = c("chromosome", "start", "end", "name", "score",
#                                         "strand", "a", "b" , "c"))
# regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
# regions_me3$peak <- "H3K27me3"
# regions_me3 <- keepStandardChromosomes(regions_me3, pruning.mode = "coarse")
# regions_me3 <- dropSeqlevels(regions_me3, value = "chrM", pruning.mode = "coarse")
# 
# regions_ac <- read.table("data/pbmc_bulk/mapped/ac_peaks.narrowPeak", sep = "\t",
#                           col.names = c("chromosome", "start", "end", "name", "score",
#                                         "strand", "a", "b" , "c", "d"))
# regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
# regions_ac$peak <- "H3K27ac"
# regions_ac <- keepStandardChromosomes(regions_ac, pruning.mode = "coarse")
# regions_ac <- dropSeqlevels(regions_ac, value = "chrM", pruning.mode = "coarse")

regions <- c(regions_me3, regions_ac)

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / sum(results)) * 10^6
  return(cmp)
}

# get filtered fragment files
# single-cell
frags_me3 <- "data/pbmc_protein/bigwig/pbmc_me3.bed.gz"
frags_ac <- "data/pbmc_protein/bigwig/pbmc_ac.bed.gz"

# bulk_cell
bulk_me3 <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
bulk_ac <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"

# quantify
ac_sc_cov <- QuantifyRegions(frags_ac, regions)
me_sc_cov <- QuantifyRegions(frags_me3, regions)

me_bulk_cov <- QuantifyRegions(bulk_me3, regions)
ac_bulk_cov <- QuantifyRegions(bulk_ac, regions)

# create matrix
mat <- cbind(ac_sc_cov, me_sc_cov, ac_bulk_cov, me_bulk_cov)
mat <- as.data.frame(mat)
mat$region <- regions$peak

# randomize order
mat <- mat[sample(nrow(mat)), ]

get_cod_expression <- function(m) {
    eq <- substitute(~~italic(R)^2~"="~r2, 
     list(r2 = format(summary(m)$r.squared, digits = 2)))
    cod <- as.expression(eq)
    return(cod)
}

ac_me3_sc <- ggplot(mat, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  ylim(c(0, 600)) + xlim(c(0, 600)) +
  xlab("") +
  ylab("") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat))) +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  theme(legend.position = "none")

ac_sc_ac_bulk <- ggplot(mat[mat$region=="H3K27ac", ], aes(x = ac_sc_cov, y = ac_bulk_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  xlab("H3K27ac (single-cell)") +
  ylab("H3K27ac (bulk)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ ac_bulk_cov, data = mat))) +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/scatterplot_pbmc.png", plot = ac_me3_sc, height = 3.5, width = 3.5)

## BMMC
frags_me3_bmmc <- "data/bmmc_dual/bulk_fragments/me3.bed.gz"
frags_ac_bmmc <- "data/bmmc_dual/bulk_fragments/ac.bed.gz"

# quantify
me_sc_cov <- QuantifyRegions(frags_me3_bmmc, regions)
ac_sc_cov <- QuantifyRegions(frags_ac_bmmc, regions)

# create matrix
mat_bmmc <- cbind(ac_sc_cov, me_sc_cov)
mat_bmmc <- as.data.frame(mat_bmmc)
mat_bmmc$region <- regions$peak

mat_bmmc <- mat_bmmc[sample(nrow(mat_bmmc)), ]

ac_me3_sc_bmmc <- ggplot(mat_bmmc, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  ylim(c(0, 600)) + xlim(c(0, 600)) +
  xlab("H3K27ac (multiplexed single-cell)") +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  ylab("H3K27me3 (multiplexed single-cell)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat_bmmc))) +
  theme(legend.position = "none") +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ggsave(filename = "plots/bmmc/scatterplot_bmmc.png", plot = ac_me3_sc_bmmc, height = 4, width = 4)

R ggplot2 tabix Seurat BSgenome.Hsapiens.UCSC.hg38 patcHwork Signac Rsamtools From line 1 of pbmc_protein/scatterplots.R

library(Signac)
library(Seurat)

obj <- readRDS("objects/pbmc_protein.rds")
Idents(obj) <- "celltype"

outdir <- "data/pbmc_protein/bigwig/"
celltypes.split <- levels(obj)

# split fragments for each assay
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
for (i in celltypes.split) {
    ct_safe <- gsub(" ", "_", i, fixed = TRUE)
    ct_safe <- gsub("+", "", ct_safe, fixed = TRUE)
    message(ct_safe)
    FilterCells(
      fragments = frags,
      cells = WhichCells(object = obj, idents = i),
      outfile = paste0(outdir, ct_safe, "_ac.bed.gz"),
      verbose = TRUE
    )
}

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
for (i in celltypes.split) {
    ct_safe <- gsub(" ", "_", i, fixed = TRUE)
    ct_safe <- gsub("+", "", ct_safe, fixed = TRUE)
    message(ct_safe)
    FilterCells(
      fragments = frags,
      cells = WhichCells(object = obj, idents = i),
      outfile = paste0(outdir, ct_safe, "_me3.bed.gz"),
      verbose = TRUE
    )
}

# remove non-cell fragments for each assay for genome bin quant
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "pbmc_ac.bed.gz"),
  verbose = TRUE
)

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "pbmc_me3.bed.gz"),
  verbose = TRUE
)

R Quant Seurat Signac From line 1 of pbmc_protein/split.R

shell:
    """
    cd data
    wget https://hgdownload-test.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
    """

SnakeMake From line 21 of master/Snakefile

shell:
    """
    cd genome/hg38
    aws s3 sync s3://stuart-genomes/hg38_analysis/bwa-mem2/ .
    aws s3 cp s3://stuart-genomes/hg38_analysis/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz .
    """

SnakeMake From line 33 of master/Snakefile

shell:
    """
    wget -i {input} -P data/ct_pro
    """

SnakeMake From line 47 of master/Snakefile

shell:
    """
    wget -i {input} -P data/mesc
    """

SnakeMake From line 57 of master/Snakefile

shell:
    """
    wget -i {input} -P data/henikoff
    wget https://github.com/Henikoff/scCUT-Tag/files/8647406/meta.csv -P data/henikoff
    wget https://github.com/Henikoff/scCUT-Tag/files/8653804/K27Ac_pbmc_meta.csv -P data/henikoff

    cd data/henikoff
    tabix -p bed GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz
    tabix -p bed GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz
    tabix -p bed GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz
    """

SnakeMake tabix From line 70 of master/Snakefile

shell:
    """
    Rscript code/CT_pro/update_objects.R
    """

SnakeMake From line 91 of master/Snakefile

shell:
    """
    cd data/multict/fragments
    aws s3 sync s3://multi-cuttag/fragments/ . --request-payer
    """

SnakeMake From line 100 of master/Snakefile

shell:
    """
    cd data/K562_ATAC
    wget https://www.encodeproject.org/files/ENCFF006OFA/@@download/ENCFF006OFA.bigBed
    wget https://www.encodeproject.org/files/ENCFF600FDO/@@download/ENCFF600FDO.bigWig
    wget https://www.encodeproject.org/files/ENCFF558BLC/@@download/ENCFF558BLC.bed.gz
    """

SnakeMake From line 109 of master/Snakefile

shell:
    """
    wget -i {input} -P data/bmmc_atac
    aws s3 sync s3://mpal-hg38/public/ ./data/bmmc_atac/ --request-payer
    """

SnakeMake From line 122 of master/Snakefile

shell:
    """
    aws s3 sync s3://bmmc-reference/public objects/ --request-payer
    """

SnakeMake From line 131 of master/Snakefile

shell:
    """
    wget -i {input} -P data/encode
    """

SnakeMake From line 144 of master/Snakefile

shell:
    """
    Rscript code/encode/combine_peaks.R
    """

SnakeMake From line 157 of master/Snakefile

shell:
    """
    cd data/pbmc_atac
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz.tbi
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_singlecell.csv
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_filtered_peak_bc_matrix.h5
    """

SnakeMake From line 166 of master/Snakefile

shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.r1} \
        {input.r2} \
        | samtools sort -@ {threads} -O bam - \
        > data/HEK_K562_sc/mapped/{wildcards.mark}.bam

    samtools index data/HEK_K562_sc/mapped/{wildcards.mark}.bam

    sinto fragments \
      -b data/HEK_K562_sc/mapped/{wildcards.mark}.bam \
      -p {threads} \
      -f data/HEK_K562_sc/sinto/{wildcards.mark}.tmp \
      --barcode_regex "[^:]*"

    sort -k1,1 -k2,2n data/HEK_K562_sc/sinto/{wildcards.mark}.tmp > data/HEK_K562_sc/sinto/{wildcards.mark}.bed
    bgzip -@ {threads} data/HEK_K562_sc/sinto/{wildcards.mark}.bed
    tabix -p bed {output.frags}
    rm data/HEK_K562_sc/sinto/{wildcards.mark}.tmp
    """

SnakeMake SAMtools tabix Bwa-mem2 From line 186 of master/Snakefile

shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.r1} \
        {input.r2} \
        | samtools sort -@ {threads} -O bam - \
        > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam
    samtools index data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam
    """

SnakeMake SAMtools Bwa-mem2 From line 217 of master/Snakefile

shell:
    """
    bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM
    """

SnakeMake DeepTools From line 232 of master/Snakefile

shell:
    """
    sinto fragments -p {threads} \
      -b {input} \
      --barcode_regex "[^:]*" \
      -f data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag

    sort -k1,1 -k2,2n data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag \
      > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed
    bgzip data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed
    tabix -p bed {output}
    rm data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag
    """

SnakeMake tabix From line 241 of master/Snakefile

shell:
    """
    Rscript code/HEK_K562_sc/process.R {threads} {output.obj} data/HEK_K562_sc/sinto
    """

SnakeMake From line 265 of master/Snakefile

shell:
    """
    # me3
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.me3_r1_mono} \
        {input.me3_r2_mono} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/mono/me3.bam
    samtools index data/pbmc_bulk/mapped/mono/me3.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.me3_r1_plex} \
        {input.me3_r2_plex} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/plex/me3.bam
    samtools index data/pbmc_bulk/mapped/plex/me3.bam

    # ac
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.ac_r1_mono} \
        {input.ac_r2_mono} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/mono/ac.bam
    samtools index data/pbmc_bulk/mapped/mono/ac.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.ac_r1_plex} \
        {input.ac_r2_plex} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/plex/ac.bam
    samtools index data/pbmc_bulk/mapped/plex/ac.bam
    """

SnakeMake SAMtools Bwa-mem2 From line 288 of master/Snakefile

shell:
    """
    bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM
    """

SnakeMake DeepTools From line 329 of master/Snakefile

shell:
    """
    sinto fragments -p {threads} \
      -b {input} \
      --barcode_regex "[^:]*" \
      -f data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag

    sort -k1,1 -k2,2n data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag \
      > data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed
    bgzip data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed
    tabix -p bed {output}
    rm data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag
    """

SnakeMake tabix From line 338 of master/Snakefile

shell:
    """
    bedtools genomecov -i {input.me3} -g {input.chrom} -bg > data/ct_pro/H3K27me3.tmp
    sort -k1,1 -k2,2n data/ct_pro/H3K27me3.tmp > {output.me3_bg}
    bedGraphToBigWig {output.me3_bg} {input.chrom} {output.me3_bw}

    bedtools genomecov -i {input.ac} -g {input.chrom} -bg > data/ct_pro/H3K27ac.tmp
    sort -k1,1 -k2,2n data/ct_pro/H3K27ac.tmp > {output.ac_bg}
    bedGraphToBigWig {output.ac_bg} {input.chrom} {output.ac_bw}
    """

SnakeMake BEDTools bedGraphToBigWig From line 363 of master/Snakefile

shell:
    """
    wget -i {input} -P data/k562_peaks
    """

SnakeMake From line 382 of master/Snakefile

shell:
    """
    Rscript code/HEK_K562_bulk/collect_peaks.R
    """

SnakeMake From line 393 of master/Snakefile

shell:
    """
    python code/demux.py \
       --read1 {input.read1} \
       --read_i5 {input.readi5} \
       --read2 {input.read2} \
       --tn5 {input.bc} \
       --output {params.outdir}
    """

SnakeMake From line 411 of master/Snakefile

shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/{wildcards.exp}/ntt/out/{wildcards.mk}.R1.fastq \
        data/{wildcards.exp}/ntt/out/{wildcards.mk}.R2.fastq \
        | samtools sort -@ {threads} -O bam - \
        > data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam

    samtools index data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam

    sinto fragments -p {threads} \
      -b data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam \
      --barcode_regex "[^:]*" \
      -f data/{wildcards.exp}/ntt/{wildcards.mk}.frag

    sort -k1,1 -k2,2n data/{wildcards.exp}/ntt/{wildcards.mk}.frag > data/{wildcards.exp}/ntt/{wildcards.mk}.tsv

    rm data/{wildcards.exp}/ntt/{wildcards.mk}.frag
    bgzip -@ {threads} data/{wildcards.exp}/ntt/{wildcards.mk}.tsv
    tabix -p bed {output}
    """

SnakeMake SAMtools tabix Bwa-mem2 From line 428 of master/Snakefile

shell:
    """
    salmon index -t {input} -i {output} --features -k7
    """

SnakeMake Salmon From line 456 of master/Snakefile

shell:
    """
    paste <(gunzip -c {input.r1}) <(gunzip -c {input.r3}) \
      | paste - - - - \
      | awk -F'\\t' -v one="data/pbmc_protein/adt/read1.fastq" '{{ OFS="\\n"; print $1,$3$4,$5,$7$8 >> one;}}'

    pigz -p {threads} data/pbmc_protein/adt/read1.fastq
    """

SnakeMake From line 468 of master/Snakefile

shell:
    """
    salmon alevin -l ISR -i {input.index} \
        -o data/pbmc_protein/adt/outs \
        -p {threads} \
        --naiveEqclass \
        --keepCBFraction 0.8 \
        -1 data/pbmc_protein/adt/TAG_S2_R2_001.fastq.gz \
        -2 {input.reads} \
        --bc-geometry 1[1-16] \
        --umi-geometry 2[1-10] \
        --read-geometry 2[71-85] \
        --tgMap data/adt_map_totalseq_a.tsv
    """

SnakeMake Salmon From line 484 of master/Snakefile

shell:
    """
    cd data/bmmc_dual/DNA
    cutadapt -j {threads} -u 14 -o K27ac_R2.fastq scBM_k27ac_S2_R2_001.fastq.gz
    cutadapt -j {threads} -u 14 -o K27me_R2.fastq scBM_k27me_S1_R2_001.fastq.gz

    gzip -d scBM_k27ac_S2_R1_001.fastq.gz scBM_k27ac_S2_R3_001.fastq.gz
    gzip -d scBM_k27me_S1_R1_001.fastq.gz scBM_k27me_S1_R3_001.fastq.gz        

    sinto barcode \
      --barcode_fastq K27ac_R2.fastq \
      --read1 scBM_k27ac_S2_R1_001.fastq \
      --read2 scBM_k27ac_S2_R3_001.fastq \
      -b 16

    sinto barcode \
      --barcode_fastq K27me_R2.fastq \
      --read1 scBM_k27me_S1_R1_001.fastq \
      --read2 scBM_k27me_S1_R3_001.fastq \
      -b 16

    pigz -p {threads} *.fastq
    mkdir barcoded
    mv scBM_k27me_S1_R1_001.barcoded.fastq.gz scBM_k27me_S1_R3_001.barcoded.fastq.gz ./barcoded/
    mv scBM_k27ac_S2_R1_001.barcoded.fastq.gz scBM_k27ac_S2_R3_001.barcoded.fastq.gz ./barcoded/
    """

SnakeMake Cutadapt From line 504 of master/Snakefile

shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R1_001.barcoded.fastq.gz \
        data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R3_001.barcoded.fastq.gz \
        | samtools sort -@ {threads} -O bam - \
        > data/bmmc_dual/DNA/H3K27me3.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R1_001.barcoded.fastq.gz \
        data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R3_001.barcoded.fastq.gz \
        | samtools sort -@ {threads} -O bam - \
        > data/bmmc_dual/DNA/H3K27ac.bam

    samtools index data/bmmc_dual/DNA/H3K27ac.bam
    samtools index data/bmmc_dual/DNA/H3K27me3.bam

    sinto fragments -p {threads} \
      -b data/bmmc_dual/DNA/H3K27ac.bam \
      --barcode_regex "[^:]*" \
      -f data/bmmc_dual/DNA/H3K27ac.frag

    sinto fragments -p {threads} \
      -b data/bmmc_dual/DNA/H3K27me3.bam \
      --barcode_regex "[^:]*" \
      -f data/bmmc_dual/DNA/H3K27me3.frag

    sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27ac.frag > data/bmmc_dual/DNA/H3K27ac.bed
    sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27me3.frag > data/bmmc_dual/DNA/H3K27me3.bed

    rm data/bmmc_dual/DNA/H3K27ac.frag data/bmmc_dual/DNA/H3K27me3.frag
    bgzip -@ {threads} data/bmmc_dual/DNA/H3K27ac.bed
    bgzip -@ {threads} data/bmmc_dual/DNA/H3K27me3.bed
    tabix -p bed {output.ac}
    tabix -p bed {output.me}
    """

SnakeMake SAMtools tabix Bwa-mem2 From line 540 of master/Snakefile

shell:
    """
    Rscript code/bmmc_atac/process_bmmc_atac.r
    """

SnakeMake From line 584 of master/Snakefile

shell:
    """
    Rscript code/henikoff/process.R
    """

SnakeMake From line 598 of master/Snakefile

shell:
    """
    Rscript code/HEK_K562_bulk/analysis.R
    """

SnakeMake From line 616 of master/Snakefile

shell:
    """
    gzip -dc {input.me3_peaks} > data/k562_peaks/ENCFF031FSF.bed
    gzip -dc {input.ac_peaks} > data/k562_peaks/ENCFF038DDS.bed

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R data/k562_peaks/ENCFF049HUP.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/h3k27me3.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R data/k562_peaks/ENCFF038DDS.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/h3k27ac.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R {input.rna_peaks} \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/rna.mat.gz

    plotHeatmap \
      -m data/HEK_K562_bulk/h3k27me3.mat.gz  \
      -o {output.me3} \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/HEK_K562_bulk/h3k27ac.mat.gz  \
      -o {output.ac} \
      --zMax 0.5 \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/HEK_K562_bulk/rna.mat.gz  \
      -o {output.rna} \
      --zMax 0.5 \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'
    """

SnakeMake DeepTools From line 637 of master/Snakefile

shell:
    """
    Rscript code/HEK_K562_sc/analysis.R
    """

SnakeMake From line 697 of master/Snakefile

shell:
    """
    Rscript code/HEK/K562_bulk/quantify_regions.R
    """

SnakeMake From line 706 of master/Snakefile

shell:
    """
    Rscript code/HEK_K562_sc/mct_scatter.R
    """

SnakeMake From line 719 of master/Snakefile

shell:
    """
    Rscript code/pbmc_atac/process.R
    """

SnakeMake From line 730 of master/Snakefile

shell:
    """
    Rscript code/pbmc_protein/process.R {threads} {input.adt} {input.ac} {input.me} {output}
    """

SnakeMake From line 742 of master/Snakefile

shell:
    """
    Rscript code/pbmc_sc/process.R {threads} {input.ac} {input.me} {output}
    """

SnakeMake From line 753 of master/Snakefile

shell:
    """
    # split fragment files
    Rscript code/pbmc_protein/split.R

    # create bigwig
    cd data/pbmc_protein/bigwig
    for fragfile in $(ls -d *.bed.gz);do
      fname=(${{fragfile//.bed.gz/ }})
      bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg"
      bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw"
    done
    """

SnakeMake BEDTools bedGraphToBigWig From line 770 of master/Snakefile

shell:
    """
    # split fragment files
    Rscript code/pbmc_sc/split.R

    # create bigwig
    cd data/pbmc_sc/ntt
    for fragfile in $(ls -d *.bed.gz);do
      fname=(${{fragfile//.bed.gz/ }})
      bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg"
      bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw"
    done
    """

SnakeMake BEDTools bedGraphToBigWig From line 795 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/split.R
    """

SnakeMake From line 814 of master/Snakefile

shell:
    """
    multiBigwigSummary BED-file -b \
      {input.me3_sc} \
      {input.ac_sc} \
      {input.me3_bulk_mono} \
      {input.me3_bulk_plex} \
      {input.ac_bulk_mono} \
      {input.ac_bulk_plex} \
      {input.me3_chip} \
      {input.ac_chip} \
      {input.ctp_ac_bw} \
      {input.ctp_me3_bw} \
      --BED {input.all_peaks} \
      -p {threads} \
      --outRawCounts {output} \
      -o data/pbmc_bulk/cor_matrix_all.npz
    """

SnakeMake From line 834 of master/Snakefile

shell:
    """
    multiBigwigSummary BED-file -b \
      {input.me3_1} \
      {input.ac_1} \
      {input.me3_2} \
      {input.ac_2} \
      --BED {input.all_peaks} \
      -p {threads} \
      --outRawCounts {output} \
      -o data/pbmc_protein/replicate_cor_matrix_all.npz
    """

SnakeMake From line 862 of master/Snakefile

shell:
    """
    Rscript code/pbmc_bulk/encode_cor_all.R
    """

SnakeMake From line 879 of master/Snakefile

shell:
    """
    # ac cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_ac.bw \
      data/pbmc_protein/bigwig/NK_ac.bw \
      data/pbmc_protein/bigwig/CD14_Mono_ac.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \
      data/pbmc_protein/bigwig/Late_erythroid_ac.bw \
      data/encode/ENCFF293ETP.bigWig \
      data/encode/ENCFF611GRL.bigWig \
      data/encode/ENCFF181OXO.bigWig \
      data/encode/ENCFF526VJO.bigWig \
      data/encode/ENCFF601NLG.bigWig \
      data/encode/ENCFF951ZBV.bigWig \
      data/encode/ENCFF064JOI.bigWig \
      --BED {input.ac} \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw_ac.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_ac.npz

    # me3 cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_me3.bw \
      data/pbmc_protein/bigwig/NK_me3.bw \
      data/pbmc_protein/bigwig/CD14_Mono_me3.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \
      data/pbmc_protein/bigwig/Late_erythroid_me3.bw \
      data/encode/ENCFF569IPX.bigWig \
      data/encode/ENCFF842JLZ.bigWig \
      data/encode/ENCFF811VAX.bigWig \
      data/encode/ENCFF499VWN.bigWig \
      data/encode/ENCFF777EGG.bigWig \
      data/encode/ENCFF046VLL.bigWig \
      data/encode/ENCFF085YMB.bigWig \
      --BED {input.me3} \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw_me3.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_me3.npz

    # all cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_me3.bw \
      data/pbmc_protein/bigwig/NK_me3.bw \
      data/pbmc_protein/bigwig/CD14_Mono_me3.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \
      data/pbmc_protein/bigwig/Late_erythroid_me3.bw \
      data/pbmc_protein/bigwig/B_cell_ac.bw \
      data/pbmc_protein/bigwig/NK_ac.bw \
      data/pbmc_protein/bigwig/CD14_Mono_ac.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \
      data/pbmc_protein/bigwig/Late_erythroid_ac.bw \
      data/encode/ENCFF569IPX.bigWig \
      data/encode/ENCFF842JLZ.bigWig \
      data/encode/ENCFF811VAX.bigWig \
      data/encode/ENCFF499VWN.bigWig \
      data/encode/ENCFF777EGG.bigWig \
      data/encode/ENCFF046VLL.bigWig \
      data/encode/ENCFF085YMB.bigWig \
      data/encode/ENCFF293ETP.bigWig \
      data/encode/ENCFF611GRL.bigWig \
      data/encode/ENCFF181OXO.bigWig \
      data/encode/ENCFF526VJO.bigWig \
      data/encode/ENCFF601NLG.bigWig \
      data/encode/ENCFF951ZBV.bigWig \
      data/encode/ENCFF064JOI.bigWig \
      --BED data/encode/all.bed \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_all.npz
    """

SnakeMake From line 896 of master/Snakefile

shell:
    """
    Rscript code/pbmc_protein/encode_cor.R
    """

SnakeMake From line 982 of master/Snakefile

shell:
    """
    Rscript code/pbmc_bulk/analysis.R
    """

SnakeMake From line 994 of master/Snakefile

shell:
    """
    Rscript code/pbmc_bulk/frip.R
    """

SnakeMake From line 1008 of master/Snakefile

shell:
    """
    Rscript code/pbmc_protein/frip.R
    """

SnakeMake From line 1024 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/frip_bmmc.R
    """

SnakeMake From line 1035 of master/Snakefile

shell:
    """
    gzip -dc {input.me3_peaks} > data/encode/ENCFF291LVP.bed
    gzip -dc {input.ac_peaks} > data/encode/ENCFF832RWT.bed

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \
      -R data/encode/ENCFF291LVP.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/pbmc_bulk/h3k27me3.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \
      -R data/encode/ENCFF832RWT.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/pbmc_bulk/h3k27ac.mat.gz

    plotHeatmap \
      -m data/pbmc_bulk/h3k27me3.mat.gz \
      -o {output.me3} \
      --whatToShow 'heatmap only' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/pbmc_bulk/h3k27ac.mat.gz \
      -o {output.ac} \
      --whatToShow 'heatmap only' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767'
    """

SnakeMake DeepTools From line 1052 of master/Snakefile

shell:
    """
    Rscript code/pbmc_protein/scatterplots.R {input.ac_peaks} {input.me3_peaks}
    """

SnakeMake From line 1100 of master/Snakefile

shell:
    """
    Rscript code/pbmc_sc/scatterplots_sc.R {input.ac_peaks} {input.me3_peaks}
    """

SnakeMake From line 1115 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/process.R {threads} {output}
    """

SnakeMake From line 1126 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/trajectory.R
    """

SnakeMake From line 1139 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/transfer.R
    """

SnakeMake From line 1147 of master/Snakefile

shell:
    """
    Rscript code/bmmc_dual/analysis.R
    """

SnakeMake From line 1155 of master/Snakefile

shell:
    """
    Rscript code/pbmc_protein/analysis.R {input.ntt} {input.ct_ac} {input.ct_me3} {input.ntt_2}
    """

SnakeMake From line 1169 of master/Snakefile

shell:
    """
    Rscript code/pbmc_bulk/quantify_regions.R
    """

SnakeMake From line 1184 of master/Snakefile

Nanobody-tethered transposition followed by sequencing

Help improve this workflow!

Installing dependencies

Running the workflow

Data availability

Plasmid availability

Citation

Code Snippets

Comments

Support

Free

Related Workflows

public

public

public

public

public

public