Nanobody-tethered transposition followed by sequencing

public public 1yr ago 0 bookmarks

This repository contains code needed to reproduce the analyses shown in Stuart et al. (2022), Nature Biotechnology

More information about NTT-seq can be found at the NTT-seq website

Installing dependencies

All the required dependencies needed to run the workflow can be installed automatically by creating a new conda environment.

First ensure that conda or mamba is installed and available.

To create a new environment with the dependencies installed, run:

# using conda
conda env create -f environment.yaml
# using mamba
mamba env create -f environment.yaml

Running the workflow

This workflow involves downloading data from AWS using the AWS command line tools. To enable the download, you will first need to create an AWS account and set up the AWS command line tools by running aws configure . Note that some of the data downloaded may incur charges from AWS .

To run the Snakemake workflow, first activate the conda environment containing the required dependencies:

conda activate ntt

Next, run snakemake with the desired options. Setting the -j parameter controls the maximum number of cores used by the workflow:

snakemake -j 24

See the snakemake documentation for a complete list of available options.

Data availability

Processed datasets from this study are available from:

Plasmid availability

Plasmids generated from this study are available from AddGene :

Citation

@ARTICLE{Stuart2022,
 title = "Nanobody-tethered transposition enables multifactorial chromatin
 profiling at single-cell resolution",
 author = "Stuart, Tim and Hao, Stephanie and Zhang, Bingjie and
 Mekerishvili, Levan and Landau, Dan A and Maniatis, Silas and
 Satija, Rahul and Raimondi, Ivan",
 journal = "Nat. Biotechnol.",
 month = dec,
 year = 2022,
 url = "http://dx.doi.org/10.1038/s41587-022-01588-5",
 language = "en"
}

Code Snippets

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
library(Signac)
library(Seurat)
library(SummarizedExperiment)
library(future)
library(GenomicRanges)
library(EnsDb.Hsapiens.v86)

plan("multicore", workers = 8)
options(future.globals.maxSize = 100 * 1024 ^ 3)

### Load processed object for the metadata
orig.obj <- readRDS("data/bmmc_atac/scATAC-Healthy-Hematopoiesis-191120.rds")
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

# get all fragment files
frag.files <- list.files("data/bmmc_atac/", pattern = "^scATAC_", full.names = TRUE)
frag.files <- paste0(frag.files, "/fragments.tsv.gz")
names(frag.files) <-  list.files("data/bmmc_atac/", pattern = "^scATAC_")

# call peaks on each and combine
pk.all <- list()
for (i in seq_along(frag.files)) {
  pks <- CallPeaks(
    object = frag.files[[i]]
  )
  pk.all[[i]] <- pks
}

gr.combined <- Reduce(f = c, x = pk.all)
gr.combined <- GenomicRanges::reduce(gr.combined)
gr.combined <- keepStandardChromosomes(gr.combined, pruning.mode = "coarse")

# quantify and create objects
md <- colData(orig.obj)

obj.list <- list()
for (i in seq_along(frag.files)) {
  obj_ident <- gsub(pattern = "scATAC_", x = names(frag.files)[[i]], replacement = "")
  cells.keep <- md[md$Group == obj_ident, "Barcode"]
  frags <- CreateFragmentObject(
    path = frag.files[[i]],
    cells = cells.keep
  )
  peakcounts <- FeatureMatrix(
    fragments = frags,
    features = gr.combined,
    cells = cells.keep
  )
  obj <- CreateChromatinAssay(counts = peakcounts, fragments = frags)
  obj <- CreateSeuratObject(counts = obj, assay = "ATAC")
  obj <- RenameCells(obj, add.cell.id = obj_ident)
  obj.list[[i]] <- obj
}

rownames(md) <- paste0(md$Group, "_", md$Barcode)
md <- as.data.frame(md)
bmmc <- merge(x = obj.list[[1]], y = obj.list[2:length(obj.list)])
bmmc <- AddMetaData(object = bmmc, metadata = md)
Annotation(bmmc) <- annot

bmmc$tissue <- sapply(strsplit(bmmc$Group, "_"), `[[`, 1)
bmmc$sample <- sapply(strsplit(bmmc$Group, "_"), `[[`, 2)

# remove PBMC
Idents(bmmc) <- "tissue"
bmmc <- subset(bmmc, idents = "PBMC", invert = TRUE)

bmmc <- NucleosomeSignal(bmmc)

# process
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc)
bmmc <- RunSVD(bmmc)
bmmc <- RunUMAP(bmmc, reduction = "lsi", dims = 2:40)
ga <- GeneActivity(bmmc)
bmmc[["GA"]] <- CreateAssayObject(counts = ga)

DefaultAssay(bmmc) <- "GA"
bmmc <- NormalizeData(bmmc)

DefaultAssay(bmmc) <- "ATAC"

# integrate embeddings
atac <- SplitObject(bmmc, split.by = "Group")
atac <- lapply(atac, RunSVD, scale.embeddings = FALSE)

atac_anchors <- FindIntegrationAnchors(
  object.list = atac,
  reduction = "rlsi",
  dims = 2:30
)

atac_integrated <- IntegrateEmbeddings(
  anchorset = atac_anchors,
  new.reduction.name = "integrated_lsi",
  reductions = bmmc[["lsi"]],
  dims.to.integrate = 1:30
)
bmmc[["integrated_lsi"]] <- atac_integrated[["integrated_lsi"]]

saveRDS(object = bmmc, file = "objects/bmmc_atac.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
library(Seurat)
library(Signac)
library(ggplot2)
library(patchwork)

bmmc <- readRDS("objects/bmmc_dual.rds")

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

p1 <- DimPlot(bmmc, reduction = "umap.me3", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) +
  theme_void() +
  theme(text = element_text(size = 10), legend.position = "none") +
  ggtitle("H3K27me3")
p2 <- DimPlot(bmmc, reduction = "umap.ac", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) +
  theme_void() +
  theme(text = element_text(size = 10), legend.position = "none") +
  ggtitle("H3K27ac") +
  NoLegend()
p3 <- DimPlot(bmmc, reduction = "umap.wnn", group.by = "celltype", label = TRUE, label.size = 4, repel = TRUE) + 
  ggtitle("BMMCs: H3K27me3 + H3K27ac") +
  theme_classic() +
  NoLegend() +
  ylab('UMAP 2') + xlab("UMAP 1")

pp <- ((p1 / p2) | p3) + plot_layout(widths = c(1, 2))
ggsave(filename = "plots/bmmc/dimplots_bmmc.png", plot = pp, height = 6, width = 8)

# fragment counts
fm <- CountFragments(
  fragments = Fragments(bmmc[["me3"]])[[1]]@path,
  cells = colnames(bmmc)
)

fa <- CountFragments(
  fragments = Fragments(bmmc[["ac"]])[[1]]@path,
  cells = colnames(bmmc)
)

mean(fm$frequency_count) # 1217
mean(fa$frequency_count) # 326
sd(fm$frequency_count)  # 1274
sd(fa$frequency_count)  # 334

# create plot
fm$mark <- "H3K27me3"
fa$mark <- "H3K27ac"
all_counts <- rbind(fm, fa)

p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments") +
  theme(legend.position = "none") +
  scale_fill_manual(values = c(colormap$H3K27ac, colormap$H3K27me3))

ggsave(filename = "plots/bmmc/fragments_bmmc.png", plot = p0, height = 3, width = 3)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
library(Signac)
library(Seurat)
library(GenomicRanges)
library(ggplot2)


bmmc <- readRDS("objects/bmmc_dual.rds")

# ENCODE peaks
pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz")
pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz")

pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3")
pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3")

t_bmmc_ac <- CountFragments(fragments = Fragments(bmmc[['ac']])[[1]]@path, cells = colnames(bmmc))
totals_bmmc_ac <- t_bmmc_ac$frequency_count
names(totals_bmmc_ac) <- t_bmmc_ac$CB

t_bmmc_me3 <- CountFragments(fragments = Fragments(bmmc[['me3']])[[1]]@path, cells = colnames(bmmc))
totals_bmmc_me3 <- t_bmmc_me3$frequency_count
names(totals_bmmc_me3) <- t_bmmc_me3$CB

# total counts in peak regions
get_total <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = Fragments(obj),
    features = regions,
    cells = colnames(obj)
  )
  return(colSums(counts))
}

# ntt
in_peak_bmmc_ac_ac <- get_total(obj = bmmc[['ac']], regions = pk_ac)
in_peak_bmmc_ac_me <- get_total(obj = bmmc[['ac']], regions = pk_me3)
in_peak_bmmc_me_me <- get_total(obj = bmmc[['me3']], regions = pk_me3)
in_peak_bmmc_me_ac <- get_total(obj = bmmc[['me3']], regions = pk_ac)

# frip
make_df <- function(totals, inpeak, assay, pk, dataset) {
  d <- data.frame(
    cell = names(totals),
    total = totals,
    in_peak = inpeak[names(totals)],
    assay = assay,
    peak = pk,
    dataset = dataset
  )
  d$FRIP <- d$in_peak / d$total
  return(d)
}

df <- rbind(
  make_df(totals_bmmc_ac, in_peak_bmmc_ac_ac, "H3K27ac", "H3K27ac", "NTT"),
  make_df(totals_bmmc_ac, in_peak_bmmc_ac_me, "H3K27ac", "H3K27me3", "NTT"),
  make_df(totals_bmmc_me3, in_peak_bmmc_me_me, "H3K27me3", 'H3K27me3', "NTT"),
  make_df(totals_bmmc_me3, in_peak_bmmc_me_ac, "H3K27me3", "H3K27ac", "NTT")
)

df_filt <- df[df$assay == df$peak, ]

p <- ggplot(df_filt, aes(x = peak, y = FRIP, fill = peak)) +
  geom_boxplot(outlier.size = 0.2) +
  scale_fill_manual(values = c("#F98401", "#D3145A")) +
  theme_bw() +
  ylab("FRiP") +
  xlab("Assay") +
  ggtitle("FRiP") +
  theme(legend.position = "none")

ggsave("plots/bmmc/frip_bmmc.png", plot = p, height = 4, width = 3)
saveRDS(object = df_filt, file = "data/bmmc_dual/frip.rds")

mean(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.18
sd(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.09
mean(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.26
sd(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.09
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(future)
library(GenomicRanges)


args <- commandArgs(trailingOnly = TRUE)
threads <- args[[1]]
outfile <- args[[2]]

plan("multicore", workers = as.integer(threads))
options(future.globals.maxSize = Inf)

# annotation
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"
genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23]

fpath <- "data/bmmc_dual/DNA"
fragfiles <- list.files(
  path = fpath,
  pattern = "*.bed.gz$",
  full.names = TRUE
)
fname <- gsub(pattern = paste0(fpath, "/"),
              replacement = "",
              x = fragfiles)
fname <- gsub(pattern = ".bed.gz",
              replacement = "",
              x = fname)
names(fragfiles) <- fname

total_counts <- CountFragments(fragments = as.list(fragfiles))
cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"]

frags_me3 <- CreateFragmentObject(
  path = fragfiles[['H3K27me3']],
  cells = cells.keep
)
frags_ac <- CreateFragmentObject(
  path = fragfiles[['H3K27ac']],
  cells = cells.keep
)

counts_me3 <- AggregateTiles(
  object = frags_me3,
  cells = cells.keep,
  min_counts = 1,
  binsize = 5000,
  genome = genome.use
)

counts_ac <- AggregateTiles(
  object = frags_ac,
  cells = cells.keep,
  min_counts = 3,
  binsize = 1000,
  genome = genome.use
)

# remove features overlapping blacklist
remove_blacklist <- function(counts) {
  olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified)
  is_bl <- queryHits(olap)
  counts <- counts[setdiff(1:nrow(counts), is_bl), ]
  return(counts)
}

# create seurat object
me3 <- CreateChromatinAssay(
  counts = remove_blacklist(counts_me3),
  fragments = fragfiles[['H3K27me3']],
  annotation = annot
)

ac <- CreateChromatinAssay(
  counts = remove_blacklist(counts_ac),
  fragments = fragfiles[['H3K27ac']],
  annotation = annot
)

bmmc <- CreateSeuratObject(counts = me3, assay = "me3")
bmmc[['ac']] <- ac

bmmc <- subset(bmmc, subset = nCount_me3 < 10000 &
                 nCount_ac < 10000 &
                 nCount_me3 > 100 &
                 nCount_ac > 75)

bmmc <- TSSEnrichment(bmmc, assay = "me3")
bmmc$tss.me3 <- bmmc$TSS.enrichment
bmmc <- TSSEnrichment(bmmc, assay = "ac")
bmmc$tss.ac <- bmmc$TSS.enrichment

bmmc <- NucleosomeSignal(bmmc, assay = "me3")
bmmc$nucleosome.me3 <- bmmc$nucleosome_signal
bmmc <- NucleosomeSignal(bmmc, assay = "ac")
bmmc$nucleosome.ac <- bmmc$nucleosome_signal

bmmc <- subset(bmmc, subset = tss.ac > 1)

DefaultAssay(bmmc) <- "me3"
feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1)) # remove low-count features
bmmc[['me3']] <- subset(bmmc[['me3']], features = feat.keep)
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_me3))
bmmc <- RunSVD(bmmc, reduction.name = "lsi.me3", n = 100, scale.embeddings = FALSE)
bmmc <- RunUMAP(bmmc, reduction = "lsi.me3", reduction.name = "umap.me3", dims = 2:100)

DefaultAssay(bmmc) <- "ac"
feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1))
bmmc[['ac']] <- subset(bmmc[['ac']], features = feat.keep)
bmmc <- FindTopFeatures(bmmc)
bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_ac))
bmmc <- RunSVD(bmmc, reduction.name = "lsi.ac", n = 100, scale.embeddings = FALSE)
bmmc <- RunUMAP(bmmc, reduction = "lsi.ac", reduction.name = "umap.ac", dims = 2:60)

bmmc <- FindMultiModalNeighbors(
  object = bmmc,
  reduction.list = list("lsi.me3", "lsi.ac"),
  dims.list = list(2:100, 2:60)
)
bmmc <- RunUMAP(bmmc, nn.name = "weighted.nn", reduction.name = "umap.wnn")
bmmc <- FindClusters(bmmc, graph.name = "wsnn", algorithm = 3, resolution = 4)

renaming <- list(
  '0' = "NK",
  '1' = "NK",
  '2' = "GMP/CLP",
  '3' = 'CD8 Naive',
  '4' = 'CD4 Naive',
  '5' = 'CD14 Mono',
  '6' = 'pDC',
  '7' = 'B',
  '8' = 'CD4 Memory',
  '9' = 'CD8 Memory',
  '10' = 'CD14 Mono',
  '11' = 'Pre-B',
  '12' = 'CD8 Memory',
  '13' = 'Late erythroid',
  '14' = 'Early erythroid',
  '15' = 'CD14 Mono',
  '16' = 'B',
  '17' = 'HSPC',
  '18' = 'CD14 Mono',
  '19' = 'CD14 Mono',
  '20' = 'Plasma',
  '21' = 'Early basophil'
)
Idents(bmmc) <- "seurat_clusters"
bmmc <- RenameIdents(bmmc, renaming)
bmmc$celltype <- Idents(bmmc)

saveRDS(object = bmmc, file = "objects/bmmc_dual.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
library(Signac)
library(Seurat)

obj <- readRDS("objects/bmmc_dual.rds")
outdir <- "data/bmmc_dual/bulk_fragments/"

# remove non-cell fragments for each assay for genome bin quant
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "ac.bed.gz"),
  verbose = TRUE
)

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "me3.bed.gz"),
  verbose = TRUE
)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
library(Signac)
library(Seurat)
library(ggplot2)
library(Matrix)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomeInfoDb)
library(RColorBrewer)
library(RcppRoll)

if (!requireNamespace("SeuratWrappers", quietly = TRUE)) {
  remotes::install_github('satijalab/seurat-wrappers')
}

library(SeuratWrappers)

if (!requireNamespace("monocle3", quietly = TRUE)) {
  setRepositories(ind=1:2)
  remotes::install_github('cole-trapnell-lab/leidenbase')
  remotes::install_github('cole-trapnell-lab/monocle3')
}

library(monocle3)
library(future)
plan("multicore", workers = 12)
options(future.globals.maxSize = Inf)


bmmc <- readRDS("objects/bmmc_dual.rds")
Idents(bmmc) <- "celltype"
cells.use <- c("HSPC", "GMP/CLP", "Pre-B", "B", "Plasma")

bcell <- bmmc[, bmmc$celltype %in% cells.use]

DefaultAssay(bcell) <- "me3"
feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features
bcell[['me3']] <- subset(bcell[['me3']], features = feat.keep)
bcell <- FindTopFeatures(bcell, min.cutoff = 2)
bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_me3))
bcell <- RunSVD(bcell, reduction.name = "lsi.me3", scale.embeddings = FALSE)
bcell <- RunUMAP(bcell, reduction = "lsi.me3", reduction.name = 'umap.me3', dims = 2:20)

DefaultAssay(bcell) <- "ac"
feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features
bcell[['ac']] <- subset(bcell[['ac']], features = feat.keep)
bcell <- FindTopFeatures(bcell, min.cutoff = 2)
bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_ac))
bcell <- RunSVD(bcell, reduction.name = "lsi.ac", scale.embeddings = FALSE)
bcell <- RunUMAP(bcell, reduction = "lsi.ac", reduction.name = 'umap.ac', dims = 2:20)

# WNN
bcell <- FindMultiModalNeighbors(
  object = bcell,
  reduction.list = list('lsi.me3', 'lsi.ac'),
  dims.list = list(2:20, 2:10)
)

bcell <- RunUMAP(bcell, nn.name = "weighted.nn", reduction.name = "umap.wnn")

## run monocle3
bcell@reductions$umap <- bcell@reductions$umap.wnn # needs to be named UMAP for monocle
bcell.cds <- as.cell_data_set(bcell)
bcell.cds <- cluster_cells(cds = bcell.cds, reduction_method = "UMAP")
bcell.cds <- learn_graph(bcell.cds, use_partition = FALSE)
root_cells <- WhichCells(bcell, idents = "HSPC")
bcell.cds <- order_cells(bcell.cds, reduction_method = "UMAP", root_cells = tail(root_cells, 10))

bcell <- AddMetaData(
  object = bcell,
  metadata = bcell.cds@principal_graph_aux@listData$UMAP$pseudotime,
  col.name = "B.cell.pseudotime"
)

fp1 <- FeaturePlot(bcell, "B.cell.pseudotime", reduction = "umap.wnn") +
  scale_color_viridis_c(option = "C") +
  theme_void() +
  ggtitle("") +
  theme(legend.position = "none")

ggsave("plots/bmmc/trajectory.png", plot = fp1, height = 4, width = 4)

## find features correlated with pseudotime
chr.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22]

bins_me3 <- GenomeBinMatrix(
  fragments = Fragments(bcell[['me3']]),
  genome = chr.use,
  binsize = 10000,
  cells = colnames(bcell)
)
saveRDS(bins_me3, "bins_me3_bcell.rds")
bins_me3 <- readRDS("bins_me3_bcell.rds")

bins_ac <- GenomeBinMatrix(
  fragments = Fragments(bcell[['ac']]),
  genome = chr.use,
  binsize = 10000,
  cells = colnames(bcell)
)
saveRDS(bins_ac, "bins_ac_bcell.rds")
bins_ac <- readRDS("bins_ac_bcell.rds")

DefaultAssay(bcell) <- "me3"
bcell <- FindNeighbors(bcell, reduction = "lsi.me3", dims = 2:20, k = 50)
knn <- bcell[['me3_nn']]

knn_smooth <- function(counts, knn) {
  knn_norm <- knn / rowSums(knn)
  smoothed <- Matrix::tcrossprod(counts, knn_norm)
  return(smoothed)
}

smoothed_me3 <- knn_smooth(counts = bins_me3, knn = knn)
smoothed_ac <- knn_smooth(counts = bins_ac, knn = knn)

# filter regions based on total counts
keep_region <- (rowSums(smoothed_ac) > 5) & (rowSums(smoothed_me3) > 5)

smoothed_me3 <- smoothed_me3[keep_region, ]
smoothed_ac <- smoothed_ac[keep_region, ]

# depth normalize
smoothed_me3 <- t(t(smoothed_me3) / colSums(smoothed_me3))
smoothed_ac <- t(t(smoothed_ac) / colSums(smoothed_ac))

## correlation
pstime <- bcell$B.cell.pseudotime[colnames(bins_me3)]
peak_cor_me3 <- cor(t(as.matrix(smoothed_me3)), pstime)[, 1]
peak_cor_ac <- cor(t(as.matrix(smoothed_ac)), pstime)[, 1]

pv_me3 <- apply(smoothed_me3, 1, function(x) {
  cor.test(x, pstime)$p.value
})
pv_ac <- apply(smoothed_ac, 1, function(x) {
  cor.test(x, pstime)$p.value
})

cor_df <- data.frame(me3 = peak_cor_me3, ac = peak_cor_ac,
                     p_me3 = pv_me3, p_adj_me3 = p.adjust(pv_me3),
                     p_ac = pv_ac, p_adj_ac = p.adjust(pv_ac))
cor_df$diff <- cor_df$me3 - cor_df$ac

# cor 0.2 and difference 0.5
tc <- cor_df[abs(cor_df$diff) > 0.5 & abs(cor_df$me3) > 0.2 & abs(cor_df$ac) > 0.2, ]

# order cells by pseudotime
porder <- order(pstime)
ac_peaks <- smoothed_ac[rownames(tc), porder]
me3_peaks <- smoothed_me3[rownames(tc), porder]

# order based on which point in trajectory has max value using broader smoothing
rowsmoothed <- t(roll_sum(as.matrix(t(me3_peaks)), n = 200))
rownames(rowsmoothed) <- rownames(me3_peaks)
rowmax <- apply(rowsmoothed, 1, which.max)
me3_peaks <- me3_peaks[order(rowmax), ]
ac_peaks <- ac_peaks[order(rowmax), ]

rowsmoothed_ac <- t(roll_sum(as.matrix(t(ac_peaks)), n = 100))
mm_ac <- t(apply(as.matrix(rowsmoothed_ac), 2, rev))

rowsmoothed_me3 <- t(roll_sum(as.matrix(t(me3_peaks)), n = 100))
mm_me3 <- t(apply(as.matrix(rowsmoothed_me3), 2, rev))

ac_max <- quantile(mm_ac, probs = seq(0, 1, 0.05))
me3_max <- quantile(mm_me3, probs = seq(0, 1, 0.05))

me3_plot <- mm_me3
me3_plot[me3_plot > me3_max[['95%']]] <- me3_max[['95%']]
me3_plot[me3_plot < me3_max[['5%']]] <- me3_max[['5%']]

ac_plot <- mm_ac
ac_plot[ac_plot > ac_max[['95%']]] <- ac_max[['95%']]
ac_plot[ac_plot < ac_max[['5%']]] <- ac_max[['5%']]

cols.use <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(100))
png("./plots/hmap_ac.png", width = 5000, height = 5000, units = "px")
image(ac_plot, col = cols.use, main = "",
      xaxt = 'n', yaxt = 'n', ylab = "", xlab = "")
dev.off()

png("./plots/hmap_me3.png", width = 5000, height = 5000, units = "px")
image(me3_plot, col = cols.use, main = "",
      xaxt = 'n', yaxt = 'n', ylab = "", xlab = "")
dev.off()

# group peaks by whether they gain me3 or gain ac
tc <- tc[order(tc$me3, decreasing = TRUE), ]
peaks_me3 <- rownames(tc[tc$me3 > 0, ])
peaks_ac <- rownames(tc[tc$ac > 0, ])

cf_me3 <- ClosestFeature(bmmc, region = peaks_me3)
cf_ac <- ClosestFeature(bmmc, region = peaks_ac)

cf_me3 <- cf_me3[cf_me3$gene_biotype == "protein_coding", ]
cf_ac <- cf_ac[cf_ac$gene_biotype == "protein_coding", ]

maxdist <- 50000
closest_me3 <- unique(cf_me3[cf_me3$distance < maxdist, "gene_name"])
closest_ac <- unique(cf_ac[cf_ac$distance < maxdist, "gene_name"])

# save colorbar
png("plots/bmmc/colorbar_heatmap.png", width = 10, height = 4, units = "in", res = 400)
image(matrix(1:100), col=cols.use)
dev.off()

# RNA
bmmc_rna <- readRDS("objects/fullref.Rds")
ct.keep.rna <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma")
bmmc_rna_bcell <- bmmc_rna[, bmmc_rna$celltype.l2 %in% ct.keep.rna]

# module score for each set of genes
bmmc_rna_bcell <- AddModuleScore(
  object = bmmc_rna_bcell,
  features = list(setdiff(closest_me3, closest_ac)),
  name = "Repressed"
)
bmmc_rna_bcell <- AddModuleScore(
  object = bmmc_rna_bcell,
  features = list(setdiff(closest_ac, closest_me3)),
  name = "Activated"
)

Idents(bmmc_rna_bcell) <- "celltype.l2"
levels(bmmc_rna_bcell) <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma")

v1 <- VlnPlot(bmmc_rna_bcell, "Repressed1", pt.size = 0) + NoLegend() +
  geom_hline(yintercept = 0) +
  ylab("Module expression") +
  xlab("") +
  ylim(c(-0.15, 0.15)) +
  ggtitle("Repressed genes") +
  theme_classic() +
  theme(legend.position = "none", text = element_text(size = 20)) +
  scale_x_discrete(guide = guide_axis(angle = 45))

v2 <- VlnPlot(bmmc_rna_bcell, "Activated1", pt.size = 0) + NoLegend() +
  geom_hline(yintercept = 0) +
  ylab("Module expression") +
  xlab("") +
  ylim(c(-0.15, 0.15)) +
  ggtitle("Activated genes") +
  theme_classic() +
  theme(legend.position = "none", text = element_text(size = 20)) +
  scale_x_discrete(guide = guide_axis(angle = 45))

ggsave(filename = "plots/bmmc/rna_repressed.pdf", plot = v1, height = 4, width = 5)
ggsave(filename = "plots/bmmc/rna_activated.pdf", plot = v2, height = 4, width = 5)

# add DE test between module scores at start vs end of trajectory
module_activated <- bmmc_rna_bcell$Activated1
module_repressed <- bmmc_rna_bcell$Repressed1

progenitor <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("HSC", "LMPP")]
mature <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("Memory B", "Plasma")]

t.test(module_activated[progenitor], module_activated[mature]) # <2.2e-16
t.test(module_repressed[progenitor], module_repressed[mature]) # <2.2e-16

########################
# BMMC atac comparison
########################

bmmc_atac <- readRDS("objects/bmmc_atac.rds")

# subset b cell lineage
ct.keep <- c("01_HSC", "06_CLP.1", "15_CLP.2", "05_CMP.LMPP",
             "17_B", "18_Plasma")
b_atac <- bmmc_atac[, bmmc_atac$BioClassification %in% ct.keep]

DefaultAssay(b_atac) <- "ATAC"
b_atac <- FindTopFeatures(b_atac, min.cutoff = 5)
b_atac <- RunTFIDF(b_atac)
b_atac <- RunSVD(b_atac)
b_atac <- RunUMAP(b_atac, reduction = "lsi", dims = 2:20)
b_atac <- FindNeighbors(b_atac, reduction = "lsi", dims = 2:20)
knn_atac <- b_atac[['ATAC_nn']]

# add numeric score for b trajectory
b_atac$trajectory <- as.numeric(factor(b_atac$BioClassification,
                                       levels = c("01_HSC",
                                                  "05_CMP.LMPP",
                                                  "06_CLP.1",
                                                  "15_CLP.2",
                                                  "17_B",
                                                  "18_Plasma")))

b_atac_traj <- b_atac$trajectory

b_atac$b <- factor(b_atac$BioClassification,
                                       levels = c("01_HSC",
                                                  "05_CMP.LMPP",
                                                  "06_CLP.1",
                                                  "15_CLP.2",
                                                  "17_B",
                                                  "18_Plasma"))


Idents(b_atac) <- "b"
levels(bcell) <- c('HSPC', 'GMP/CLP', 'Pre-B', 'B', 'Plasma')

# check dynamic regions in atac dataset
tc_regions_atac <- FeatureMatrix(
  fragments = Fragments(b_atac),
  features = rownames(tc),
  cells = colnames(b_atac)
)
smoothed_atac <- knn_smooth(counts = tc_regions_atac[, colnames(knn_atac)], knn = knn_atac)
smoothed_atac <- t(t(smoothed_atac) / colSums(smoothed_atac))
peak_cor_atac <- cor(t(as.matrix(smoothed_atac)), b_atac_traj)[, 1]

pv_atac <- apply(smoothed_atac, 1, function(x) {
  cor.test(x, b_atac_traj)$p.value
})

nc <- peak_cor_atac[abs(peak_cor_atac) < 0.05]
ns <- p.adjust(pv_atac[names(nc)]) > 0.01

sum(ns)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
library(Signac)
library(Seurat)

atac <- readRDS("objects/bmmc_atac.rds")
bmmc_dual <- readRDS("objects/bmmc_dual.rds")

bmmc_atac <- atac[, atac$tissue == "BMMC"]

DefaultAssay(bmmc_atac) <- "ATAC"
bmmc_atac <- FindTopFeatures(bmmc_atac)
bmmc_atac <- RunTFIDF(bmmc_atac)
bmmc_atac <- RunSVD(bmmc_atac)
bmmc_atac <- RunUMAP(bmmc_atac, reduction = "lsi", dims = 2:50)

# quantify atac peaks in ntt
counts <- FeatureMatrix(
  fragments = Fragments(bmmc_dual[['ac']])[[1]],
  features = granges(bmmc_atac[['ATAC']]),
  cells = colnames(bmmc_dual)
)

bmmc_dual[['ATAC']] <- CreateChromatinAssay(
  counts = counts, fragments = Fragments(bmmc_dual[['ac']])[[1]]
)

DefaultAssay(bmmc_dual) <- "ATAC"

# normalize with reference IDF
get_idf <- function(x) {
  rsums <- rowSums(x = x)
  idf <- ncol(x = x) / rsums
  idf <- log(1 + idf)
  return(idf)
}

idf <- get_idf(x = GetAssayData(bmmc_atac, assay = "ATAC", slot = "counts"))

bmmc_dual <- RunTFIDF(bmmc_dual, idf = idf)

# find transfer anchors
anchors <- FindTransferAnchors(
  reference = bmmc_atac,
  query = bmmc_dual,
  reduction = "lsiproject",
  dims = 2:30,
  reference.reduction = "lsi",
  reference.assay = "ATAC",
  query.assay = "ATAC"
)

pred <- TransferData(
  anchorset = anchors,
  refdata = bmmc_atac$BioClassification,
  weight.reduction = bmmc_dual[['lsi.me3']],
  dims = 2:50
)

write.table(pred, "datasets/bmmc_dual_predictions.tsv", quote = FALSE, sep = "\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
library(Signac)
library(Seurat)

ac <- readRDS("data/ct_pro/H3K27ac.rds")
Fragments(ac) <- NULL
Fragments(ac) <- CreateFragmentObject(
    path = "data/ct_pro/H3K27ac_fragments.tsv.gz",
    cells = Cells(ac)
)
saveRDS(object = ac, file = "data/ct_pro/H3K27ac_updated.rds")

me <- readRDS("data/ct_pro/H3K27me3.rds")
Fragments(me) <- NULL
Fragments(me) <- CreateFragmentObject(
    path = "data/ct_pro/H3K27me3_fragments.tsv.gz",
    cells = Cells(me)
)
saveRDS(object = me, file = "data/ct_pro/H3K27me3_updated.rds")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import pysam
from Bio import SeqIO
import os
import gzip
from pathlib import Path
from argparse import ArgumentParser
import sys

# demux using i5 index only

# parse args
parser = ArgumentParser(description='Read demultiplexer')
parser.add_argument('--read1', help='Path to read1')
parser.add_argument('--read_i5', help='Path to read2')
parser.add_argument('--read2', help='Path to read3')
parser.add_argument('--tn5', help='Path to Tn5 i5 index FASTA file')
parser.add_argument('--output', help='Path to output directory')
args = parser.parse_args()


def hamming_distance(s1, s2):
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))


def open_fastq(f):
    if os.path.isfile(f):
        if f.endswith(".gz"):
            f_open = gzip.open(f, "rt") # rt read text mode (decodes binary gzip)
        else:
            f_open = open(f, "r")
        return(f_open)
    else:
        raise Exception("File not found")


def read_sample_barcodes(inpath, maxlen=8):
    all_bc = dict()
    for i in SeqIO.parse(open(inpath),'fasta'):
        all_bc[str(i.seq[:maxlen])] = i.id
    return(all_bc)


def get_entry(f):
    return([f.readline(), f.readline(), f.readline(), f.readline()])


def extract_barcodes_simple(sequence, bc1_len=8, spacer_len=14):
    # version with fixed-length barcodes
    sequence = sequence.strip("\n")
    tn5_bc = sequence[:bc1_len]
    cell_bc = sequence[bc1_len+spacer_len:]
    if len(cell_bc) != 16:
        print(cell_bc)
    return((tn5_bc, cell_bc))

tn5_i5_barcodes = read_sample_barcodes(args.tn5, maxlen=8) # using first 8 bases of barcode

r1_read = open_fastq(f=args.read1)
i5_read = open_fastq(f=args.read_i5)
r2_read = open_fastq(f=args.read2)

# create dictionary with file handles
outf = dict()
outpath = Path(args.output)
if not outpath.exists():
    os.mkdir(outpath)

unique_i5_marks = [tn5_i5_barcodes[key] for key in tn5_i5_barcodes.keys()]
unique_i5_marks.append('unknown')
for i5 in unique_i5_marks:
    # outputting uncompressed fastq is ~10x faster
    fname_1 = open(outpath / (i5 + ".R1.fastq"), "w+")
    fname_2 = open(outpath / (i5 + ".R2.fastq"), "w+")
    outf[i5] = (fname_1, fname_2)

x = 0
while True:
    r1_entry = get_entry(r1_read)
    r2_entry = get_entry(r2_read)
    i5_entry = get_entry(i5_read)

    if r1_entry[0] == '':
        break

    # get barcodes
    tn5_barcode, cell_barcode = extract_barcodes_simple(sequence=i5_entry[1], bc1_len=8)

    if tn5_barcode in tn5_i5_barcodes.keys():
        mark = tn5_i5_barcodes[tn5_barcode]
    else:
        # compute hamming distance
        hams = [hamming_distance(tn5_barcode, x) < 3 for x in tn5_i5_barcodes.keys()]
        if sum(hams) == 1:
            mark = tn5_i5_barcodes[list(tn5_i5_barcodes.keys())[hams.index(True)]]
        else:
            mark = "unknown"

    # add barcodes to r1 and r2 genomic
    bc_combination = "@" + cell_barcode + ":" + tn5_barcode + "+"
    r1_entry[0] = bc_combination + r1_entry[0][1:]
    r2_entry[0] = bc_combination + r2_entry[0][1:]

    if mark == "unknown":
        r1_outf = outf['unknown'][0]
        r2_outf = outf['unknown'][1]
    else:
        # write to correct output files based on barcode combination
        r1_outf = outf[mark][0]
        r2_outf = outf[mark][1]

    r1_outf.write("".join(r1_entry))
    r2_outf.write("".join(r2_entry))
    x += 1
    if x % 1e6 == 0:
        print("Processed " + str(int(x/1e6)) + " million reads", file=sys.stderr, end="\r")

# close all files
for i in outf.keys():
    outf[i][0].close()
    outf[i][1].close()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
library(GenomicRanges)

me3_peaks <- c("ENCFF138FVQ", "ENCFF720AKK",
               "ENCFF407VKP","ENCFF588JEN",
               "ENCFF570DGJ", "ENCFF071JFL",
               "ENCFF276IMO")

ac_peaks <- c("ENCFF240LSH", "ENCFF614FJO",
              "ENCFF449ITG", "ENCFF321NYQ",
              "ENCFF544QYY", "ENCFF059WXH",
              "ENCFF412NPA")

bulk_me3 <- "data/encode/ENCFF291LVP.bed.gz"
bulk_ac <- "data/encode/ENCFF832RWT.bed.gz"

me3_bedfiles <- paste0("data/encode/", me3_peaks, ".bed.gz")
ac_bedfiles <- paste0("data/encode/", ac_peaks, ".bed.gz")

read_bed <- function(filepath) {
    df <- read.table(file = filepath,
                     col.names = c("chr", "start", "end",
                                   "name", "score", "strand",
                                   "fold_change", "neg_log10pvalue_summit",
                                   "neg_log10qvalue_summit", "relative_summit_position"))
    gr <- makeGRangesFromDataFrame(df = df, keep.extra.columns = TRUE, 
        starts.in.df.are.0based = TRUE)
    return(gr)
}

me3_ranges <- sapply(me3_bedfiles, read_bed)
ac_ranges <- sapply(ac_bedfiles, read_bed)

all_me3 <- Reduce(f = c, x = me3_ranges)
me3 <- reduce(all_me3)
me3 <- keepStandardChromosomes(me3, pruning.mode = "coarse")
me3 <- me3[width(me3) < 20000]

all_ac <- Reduce(f = c, x = ac_ranges)
ac <- reduce(all_ac)
ac <- keepStandardChromosomes(ac, pruning.mode = "coarse")
ac <- ac[width(ac) < 20000]

combined <- c(ac, me3)

ac <- as.data.frame(ac)
write.table(x = ac, file = "data/encode/h3k27ac.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
me3 <- as.data.frame(me3)
write.table(x = me3, file = "data/encode/h3k27me3.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
combined <- as.data.frame(combined)
write.table(x = combined, file = "data/encode/all.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)

# bulk peaks
ac_bulk <- read_bed(bulk_ac)
me3_bulk <- read_bed(bulk_me3)
all_bulk <- c(ac_bulk, me3_bulk)
all_bulk <- as.data.frame(all_bulk)
write.table(x = all_bulk, file = "data/encode/all_bulk.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
library(Signac)
library(ggplot2)

bigWigs <- list(
  "RNAPII (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-Pol2.bw",
  "H3K27me3 (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27me.bw",
  "H3K27ac (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27ac.bw",
  "RNAPII (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw",
  "H3K27me3 (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw",
  "H3K27ac (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw"
)

bw <- BigwigTrack(
  region = "chr11-69900000-70910000",
  bigwig = bigWigs,
  y_label = "Normalized coverage",
  ymax = "q90",
  bigwig.scale = "common",
  smooth = 1000
) + theme(legend.position = "none")

bw <- bw + scale_fill_manual(values = c("#036C9A", "#D3145A", "#F98401",
                                        "#676767", "#676767", "#676767"))

ggsave(
  filename = "plots/hek_k562_bulk/bulk_covplot_k562.png",
  plot = bw,
  height = 5,
  width = 10,
  units = "in"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
library(GenomicRanges)

regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE)

regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE)

regions_rna <- reduce(c(regions_s2, regions_s5))

regions_rna <- as.data.frame(regions_rna)
write.table(regions_rna, file = "data/k562_peaks/rna.bed", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(GenomeInfoDb)
library(BSgenome.Hsapiens.UCSC.hg38)
library(GenomicRanges)
library(Rsamtools)

obj <- readRDS("objects/hek_k562.rds")

# dimplot for figure 1
p_ac <- DimPlot(obj, reduction = "umap.k27ac", pt.size = 0.1) + ggtitle("H3K27ac") + theme_void() + theme(legend.position = "none")
p_me <- DimPlot(obj, reduction = "umap.k27me", pt.size = 0.1) + ggtitle("H3K2me3") + theme_void() + theme(legend.position = "none")
p_pol <- DimPlot(obj, reduction = "umap.pol2", pt.size = 0.1) + ggtitle("RNAPII") + theme_void() + theme(legend.position = "none")
p_wnn <- DimPlot(obj, reduction = "wnn.umap") + ggtitle("H3K27me3 + H3K27ac + RNAPII") + theme_classic() + xlab("UMAP 1") + ylab("UMAP 2") + theme(legend.position = "none")

p1 <- ((p_ac / p_me / p_pol) | p_wnn) + plot_layout(widths = c(1, 2))

ggsave("plots/hek_k562/dimplot_hek_k562.pdf", plot = p1, height = 5, width = 8)

# bulk data for comparison
hek_mono_k27ac <- "data/HEK_K562_bulk/mapped/HEK-mono-K27ac.bw"
hek_mono_k27me3 <- "data/HEK_K562_bulk/mapped/HEK-mono-K27me.bw"
hek_mono_s2s5p <- "data/HEK_K562_bulk/mapped/HEK-mono-Pol2.bw"
k562_mono_k27ac <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw"
k562_mono_k27me3 <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw"
k562_mono_s2s5p <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw"

DefaultAssay(obj) <- "K27ac"

p <- CoveragePlot(
  object = obj,
  region = "SCMH1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 20000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
    filename = "plots/SCMH1_k562.png",
    plot = p,
    dpi = 600,
    units = "in",
    height = 5,
    width = 15
)

# additional regions for supplement
p <- CoveragePlot(
  object = obj,
  region = "NASP",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 20000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/NASP_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

p <- CoveragePlot(
  object = obj,
  region = "FAF1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 50000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/FAF1_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

p <- CoveragePlot(
  object = obj,
  region = "SDAD1",
  idents = "K562",
  assay = list("K27ac", "K27me", "Pol2"),
  extend.upstream = 100000,
  extend.downstream = 100000,
  window = 50,
  split.assays = TRUE,
  peaks = FALSE,
  bigwig = list("H3K27ac" = k562_mono_k27ac,
                "H3K27me3" = k562_mono_k27me3,
                "RNAPII" = k562_mono_s2s5p)
) & NoLegend()

colors.use <- list("#F98401", "#D3145A", "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

p <- p[[1]]
p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use)
p[[2]] <- p[[2]] & scale_fill_manual(values =  c("darkgrey", "darkgrey", "darkgrey"))

ggsave(
  filename = "plots/SDAD1_k562.png",
  plot = p,
  dpi = 600,
  units = "in",
  height = 5,
  width = 15
)

# multiCUT&Tag comparison
# same
me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz"
ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz"

# mixed
ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz"
me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz"

totals_mct <- CountFragments(
  fragments = list(me3_me3, ac_ac, ac_me3, me3_ac)
)

cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"]

# count fragments for each mark
total_me3_mct <- CountFragments(fragments = me3_me3, cells = cells_keep_mct)
total_ac_mct <- CountFragments(fragments = ac_ac, cells = cells_keep_mct)

# count mixed 1/2 for each
total_me3_mixed_mct <- CountFragments(fragments = me3_ac, cells = cells_keep_mct)
total_ac_mixed_mct <- CountFragments(fragments = ac_me3, cells = cells_keep_mct)
total_me3_mixed_mct$frequency_count <- total_me3_mixed_mct$frequency_count / 2
total_ac_mixed_mct$frequency_count <- total_ac_mixed_mct$frequency_count / 2

mct_counts_me3 <- total_me3_mct$frequency_count
names(mct_counts_me3) <- total_me3_mct$CB
mct_counts_me3[total_me3_mixed_mct$CB] <- mct_counts_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count
mct_counts_me3[total_ac_mixed_mct$CB] <- mct_counts_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count

mct_counts_ac <- total_ac_mct$frequency_count
names(mct_counts_ac) <- total_ac_mct$CB
mct_counts_ac[total_ac_mixed_mct$CB] <- mct_counts_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count
mct_counts_ac[total_me3_mixed_mct$CB] <- mct_counts_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count

# same for reads
mct_reads_me3 <- total_me3_mct$reads_count
names(mct_reads_me3) <- total_me3_mct$CB
mct_reads_me3[total_me3_mixed_mct$CB] <- mct_reads_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count
mct_reads_me3[total_ac_mixed_mct$CB] <- mct_reads_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count

mct_reads_ac <- total_ac_mct$reads_count
names(mct_reads_ac) <- total_ac_mct$CB
mct_reads_ac[total_ac_mixed_mct$CB] <- mct_reads_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count
mct_reads_ac[total_me3_mixed_mct$CB] <- mct_reads_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count

count_df <- data.frame(count = c(mct_counts_me3, mct_counts_ac),
                       mark = c(rep("H3K27me3", length(mct_counts_me3)),
                                rep("H3K27ac", length(mct_counts_ac))),
                       dataset = "multiCUT&Tag",
                       reads = c(mct_reads_me3, mct_reads_ac)
                       )

# ntt
total_me3 <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/K27me.bed.gz",
    cells = colnames(obj)
)
total_ac <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/K27ac.bed.gz",
    cells = colnames(obj)
)
total_pol <- CountFragments(
    fragments = "data/HEK_K562_sc/sinto/Pol2.bed.gz",
    cells = colnames(obj)
)

# construct total vectors
me3 <- total_me3$frequency_count
names(me3) <- total_me3$CB
df1 <- data.frame(
    count = me3,
    mark = "H3K27me3",
    dataset = "scNTT-seq"
)
df1$reads <- total_me3$reads_count

ac <- total_ac$frequency_count
names(ac) <- total_ac$CB
df2 <- data.frame(
    count = ac,
    mark = "H3K27ac",
    dataset = "scNTT-seq"
)
df2$reads <- total_ac$reads_count

pol <- total_pol$frequency_count
names(pol) <- total_pol$CB
df3 <- data.frame(
    count = pol,
    mark = "RNAPII",
    dataset = "scNTT-seq"
)
df3$reads <- total_pol$reads_count

count_df <- rbind(count_df, df1, df2, df3)
count_df <- rbind(count_df, data.frame(count=c(1,1), mark="RNAPII", dataset="multiCUT&Tag", reads=c(1,1)))

# total fragments, split by mark
p3 <- ggplot(data = count_df, mapping = aes(mark, reads, fill=dataset)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_y_log10() +
  theme_bw() +
  ylab("Total reads per cell") +
  xlab("Antibody") +
  theme(legend.position = "none") +
  ggtitle("Reads per cell")

p4 <- ggplot(data = count_df, mapping = aes(mark, count, fill=dataset)) +
  geom_boxplot(outlier.size = 0.1) +
  scale_y_log10() +
  theme_bw() +
  ylab("Total fragments per cell") +
  xlab("Antibody") +
  ggtitle("Fragments per cell")

ggsave(
    filename = "plots/hek_k562/sensitivity.png",
    plot = (p3 | p4),
    height = 4,
    width = 7,
    dpi = 500
)

# Region heatmaps
atac_peaks <- rtracklayer::import("data/K562_ATAC/ENCFF006OFA.bigBed")
atac_peaks <- reduce(atac_peaks)
chr.keep <- c(paste0("chr", 1:22), "chrX")
atac_peaks <- keepSeqlevels(atac_peaks, value = chr.keep, pruning.mode = "coarse")
atac_peaks <- subsetByOverlaps(atac_peaks, ranges = blacklist_hg38_unified, invert = TRUE)

regions_me3 <- read.table("data/k562_peaks/ENCFF031FSF.bed.gz", sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

regions_ac <- read.table("data/k562_peaks/ENCFF038DDS.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE)

regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE)

regions_rna <- reduce(c(regions_s2, regions_s5))
regions_rna$peak <- "RNAPII"

regions <- c(regions_me3, regions_ac, regions_rna)
regions <- keepStandardChromosomes(regions, pruning.mode = "coarse")
regions <- dropSeqlevels(regions, value = "chrM", pruning.mode = "coarse")
regions <- dropSeqlevels(regions, value = "chrY", pruning.mode = "coarse")

atac_chr1 <- atac_peaks[seqnames(atac_peaks) == 'chr1']
me3_peaks <- regions_me3[seqnames(regions_me3) == 'chr1']
ac_peaks <- regions_ac[seqnames(regions_ac) == 'chr1']
pol2_peaks <- regions_rna[seqnames(regions_rna) == 'chr1']

# ATAC
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "K27ac"
)
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "K27me"
)
obj <- RegionMatrix(
    object = obj,
    regions = atac_chr1,
    key = "ATAC",
    assay = "Pol2"
)

# me3
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = me3_peaks,
  key = "me3",
  assay = "Pol2"
)

# ac
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = ac_peaks,
  key = "ac",
  assay = "Pol2"
)

# pol2
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "K27ac"
)
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "K27me"
)
obj <- RegionMatrix(
  object = obj,
  regions = pol2_peaks,
  key = "pol2",
  assay = "Pol2"
)

# plots
cm <- list("K27ac" = "#F98401", "K27me" = "#D3145A", "Pol2" = "#036C9A")
p4 <- RegionHeatmap(
  object = obj,
  assay = c("K27ac", "K27me", "Pol2"),
  cols = cm,
  key = "ATAC",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("ATAC peaks")

p5 <- RegionHeatmap(
  object = obj,
  assay = c("K27me", "K27ac", "Pol2"),
  cols = cm,
  key = "me3",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("H3K27me3 peaks")

p6 <- RegionHeatmap(
  object = obj,
  assay = c("K27ac", "K27me", "Pol2"),
  cols = cm,
  key = "ac",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("H3K27ac peaks")

p7 <- RegionHeatmap(
  object = obj,
  assay = c("Pol2", "K27ac", "K27me"),
  cols = cm,
  key = "pol2",
  min.counts = 20,
  normalize = FALSE,
  window = 50,
  nrow = 1,
  idents = "K562"
) & ylab("RNAPII peaks")

update_plot <- function(p, y) {
  p[[1]] <- p[[1]] + ylab(y)
  p[[2]] <- p[[2]] + theme(axis.title.y=element_blank(),
                           axis.text.y=element_blank())
  p[[3]] <- p[[3]] + theme(axis.title.y=element_blank(),
                           axis.text.y=element_blank())
  return(p)
}

p4 <- update_plot(p4, y = "ATAC peaks")
p5 <- update_plot(p5, y = "H3K27me3 peaks")
p6 <- update_plot(p6, y = "H3K27ac peaks")
p7 <- update_plot(p7, y = "RNAPII peaks")

pp <- (p4 | p5 | p6 | p7) & theme(legend.position = "none")
ggsave(filename = "plots/hek_k562/region_heatmaps.png", plot = pp, height = 6, width = 12)

# genome-wide correlation plot

# bulk fragfiles
k562_mono_k27ac_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bed.gz"
k562_mono_k27me3_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bed.gz"
k562_mono_s2s5p_frag <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bed.gz"

# split single-cell
sc_k27_ac <- "data/HEK_K562_sc/split/K27ac_k562_filter.bed.gz"
sc_k27_me <- "data/HEK_K562_sc/split/K27me_k562_filter.bed.gz"
sc_pol <- "data/HEK_K562_sc/split/Pol2_k562_filter.bed.gz"

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}


# need bulk vs bulk
# bulk vs single cell
# single cell vs single cell

k27ac_cov <- QuantifyRegions(k562_mono_k27ac_frag, regions)
k27me3_cov <- QuantifyRegions(k562_mono_k27me3_frag, regions)
pol2_cov <- QuantifyRegions(k562_mono_s2s5p_frag, regions)

pol2_sc_cov <- QuantifyRegions(sc_pol, regions)
ac_sc_cov <- QuantifyRegions(sc_k27_ac, regions)
me_sc_cov <- QuantifyRegions(sc_k27_me, regions)

# create matrix
mat <- cbind(k27ac_cov, ac_sc_cov, k27me3_cov, me_sc_cov, pol2_cov, pol2_sc_cov)

correlation_matrix <- cor(mat)
colnames(correlation_matrix) <- c("Bulk H3K27ac", "sc H3K27ac",
                                  "Bulk H3K27me3", "sc H3K27me3",
                                  "Bulk RNAPII", "sc RNAPII")
rownames(correlation_matrix) <- colnames(correlation_matrix)
df <- as.data.frame(as.table(correlation_matrix))

p8 <- ggplot(df, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  scale_fill_distiller(palette = "RdYlBu") +
  xlab("") + ylab("") +
  guides(fill = guide_legend(title = "Pearson\ncorrelation")) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))

ggsave(filename = "plots/hek_k562/correlation.png", plot = p8, height = 5, width = 6)

# scatterplots

# 1. use encode peak regions for each mark
# 2. plot one mark vs another
# 3. fit linear model

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

mat <- as.data.frame(mat)
mat$region <- regions$peak
colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

# bulk-bulk
m.use <- mat[mat$region %in% c("H3K27me3", "H3K27ac"), ]
bulk_ac_me3 <- ggplot(m.use, aes(k27ac_cov, k27me3_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27me3 (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ k27me3_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII", "H3K27ac"), ]
bulk_ac_pol <- ggplot(m.use,
                      aes(k27ac_cov, pol2_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 300)) + ylim(c(0, 300)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("RNAPII (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII", "H3K27me3"), ]
bulk_pol_me <- ggplot(m.use,
                      aes(pol2_cov, k27me3_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("H3K27me3 (bulk, single antibody)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ k27me3_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# bulk-sc
m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ]
bulk_ac_sc_me3 <- ggplot(m.use,
                         aes(k27ac_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 800)) + ylim(c(0, 800)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ]
bulk_ac_sc_pol <- ggplot(m.use,
                         aes(k27ac_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ]
bulk_pol_sc_me <- ggplot(m.use,
                         aes(pol2_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac"), ]
bulk_ac_sc_ac <- ggplot(m.use,
                        aes(k27ac_cov, ac_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  scale_color_manual(values = colormap) +
  xlim(c(0, 600)) + ylim(c(0, 600)) +
  xlab("H3K27ac (bulk, single antibody)") +
  ylab("H3K27ac (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27ac_cov ~ ac_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3"), ]
bulk_me_sc_me <- ggplot(m.use,
                        aes(k27me3_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 300)) + ylim(c(0, 300)) +
  xlab("H3K27me3 (bulk, single antibody)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(k27me3_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("RNAPII"), ]
bulk_pol_sc_pol <- ggplot(m.use,
                          aes(pol2_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  scale_color_manual(values = colormap) +
  xlim(c(0, 200)) + ylim(c(0, 200)) +
  xlab("RNAPII (bulk, single antibody)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# sc-sc
m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ]
sc_ac_sc_me3 <- ggplot(m.use,
                       aes(ac_sc_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single-cell, multiplexed)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ]
sc_ac_sc_pol <- ggplot(m.use,
                       aes(ac_sc_cov, pol2_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single-cell, multiplexed)") +
  ylab("RNAPII (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ pol2_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ]
sc_pol_sc_me <- ggplot(m.use,
                       aes(pol2_sc_cov, me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 200)) + ylim(c(0, 200)) +
  xlab("RNAPII (single-cell, multiplexed)") +
  ylab("H3K27me3 (single-cell, multiplexed)") +
  ggtitle(get_cod_expression(lm(pol2_sc_cov ~ me_sc_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

pp <- (bulk_ac_me3 / bulk_ac_pol / bulk_pol_me) | (bulk_ac_sc_me3 / bulk_ac_sc_pol / bulk_pol_sc_me) | (sc_ac_sc_me3 / sc_ac_sc_pol / sc_pol_sc_me) | (bulk_me_sc_me / bulk_ac_sc_ac / bulk_pol_sc_pol)
pp <- pp & NoLegend()
ggsave(filename = "plots/hek_k562/scatterplots.png", plot = pp, height = 10, width = 12)

# ternary plot for single-cell data
if (!requireNamespace(package = "ggtern", quietly = TRUE)) {
  install.packages("ggtern")
}
library(ggtern)

region_counts <- mat[, c("region", "ac_sc_cov", "me_sc_cov", "pol2_sc_cov")]
colnames(region_counts) <- c("region", "H3K27ac", "H3K27me3", "RNAPII")

# randomize plotting order
region_counts <- region_counts[sample(1:nrow(region_counts)), ]

gp <- ggtern(region_counts, aes(x = H3K27ac, y = H3K27me3, z = RNAPII, color = region)) +
  geom_point(size = 0.1, alpha = 0.2) +
  scale_color_manual(values = colors.use) +
  theme_classic()

ggsave(filename = "plots/hek_k562/ternary.png", plot = gp, height = 8, width = 8)

## KNN purity ## 
if (!requireNamespace(package = "RANN", quietly = TRUE)) {
  install.packages("RANN")
}
library(RANN)

knn_purity_emb <- function(embeddings, dims, clusters, k) {
  nn <- nn2(data = embeddings[, dims], k = k + 1)$nn.idx[, 2:k]
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = nn))) {
    nn_purity[i] <- sum(clusters[nn[i, ]] == clusters[i]) / k
  }
  return(nn_purity)
}

knn_purity_graph <- function(graph, clusters) {
  # pre-existing graph, can't change k
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = graph))) {
    nbr <- names(which(graph[i, ] == 1))
    nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ])
  }
  return(nn_purity)
}

# compute wnn for each combination of modalities
obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_all",
  reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_ac_me3",
  reduction.list = list("lsi.k27ac", "lsi.k27me"), 
  dims.list = list(2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_ac_pol2",
  reduction.list = list("lsi.k27ac", "lsi.pol2"), 
  dims.list = list(2:10, 2:10)
)

obj <- FindMultiModalNeighbors(
  object = obj,
  knn.graph.name = "wknn_me3_pol2",
  reduction.list = list("lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10)
)

dims <- 2:10
k <- 50
emb_me3 <- Embeddings(obj[['lsi.k27me']])
emb_ac <- Embeddings(obj[['lsi.k27ac']])
emb_pol <- Embeddings(obj[['lsi.pol2']])

ct <- obj$celltype

kme3 <- data.frame(
  purity = knn_purity_emb(embeddings = emb_me3, dims = dims, clusters = ct, k = k),
  assay = "H3K27me3",
  celltype = ct
)
kac <- data.frame(
  purity = knn_purity_emb(embeddings = emb_ac, dims = dims, clusters = ct, k = k),
  assay = "H3K27ac",
  celltype = ct
)
kpol <- data.frame(
  purity = knn_purity_emb(embeddings = emb_pol, dims = dims, clusters = ct, k = k),
  assay = "RNAPII",
  celltype = ct
)

kme3_ac <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_ac_me3']], clusters = ct),
  assay = "H3K27me3 + H3K27ac",
  celltype = ct
)
kac_pol <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_ac_pol2']], clusters = ct),
  assay = "H3K27ac + RNAPII",
  celltype = ct
)
kpol_me <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_me3_pol2']], clusters = ct),
  assay = "H3K27me3 + RNAPII",
  celltype = ct
)
kpol_me_ac <- data.frame(
  purity = knn_purity_graph(graph = obj[['wknn_all']], clusters = ct),
  assay = "H3K27me3 + H3K27ac + RNAPII",
  celltype = ct
)

kdf <- rbind(kme3, kac, kpol, kme3_ac, kac_pol, kpol_me, kpol_me_ac)
kdf$assay <- factor(kdf$assay, levels = c("RNAPII", "H3K27ac", "H3K27me3",
                                             "H3K27ac + RNAPII", "H3K27me3 + RNAPII", "H3K27me3 + H3K27ac",
                                             "H3K27me3 + H3K27ac + RNAPII"))

knn_p <- ggplot(kdf, aes(y = assay, x = purity, fill = assay)) +
  ggridges::geom_density_ridges(bandwidth = 0.008) +
  xlab("Fraction of neighbors of same cell type") + ylab("Assay") +
  theme_classic() +
  scale_fill_manual(values = c("#036C9A", "#F98401", "#D3145A", "#7e784e", "#6b407a", "#e64c2e", "#9a5752")) +
  xlim(c(0.8, 1.01)) +
  theme(legend.position = "none", text = element_text(size = 12)) + 
  geom_vline(xintercept = 1)

ggsave(filename = "plots/hek_k562/knn_purity_wnn.png", plot = knn_p, height = 4, width = 7)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
library(Signac)
library(ggplot2)
library(patchwork)
library(Rsamtools)


QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401")

# multiCUT&Tag
# same
me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz"
ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz"

# mixed
ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz"
me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz"

# peaks
h3k27me3 <- read.table("data/mesc/ENCFF008XKX.bed.gz", sep = "\t") # H3K27me3 ES-Bruce4 mm10
h3k27ac <- read.table("data/mesc/ENCFF360VIS.bed.gz", sep = "\t") # H3K27ac ES-Bruce4 mm10

h3k27me3 <- makeGRangesFromDataFrame(
  df = h3k27me3, seqnames.field = "V1", start.field = "V2", end.field = "V3"
)
h3k27ac <- makeGRangesFromDataFrame(
  df = h3k27ac, seqnames.field = "V1", start.field = "V2", end.field = "V3"
)

h3k27me3 <- keepStandardChromosomes(h3k27me3, pruning.mode = "coarse")
h3k27ac <- keepStandardChromosomes(h3k27ac, pruning.mode = "coarse")

totals_mct <- CountFragments(
  fragments = list(me3_me3, ac_ac, ac_me3, me3_ac)
)

cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"]

me3_me3_filt <- "data/multict/fragments/H3K27me3-H3K27me3_filtered.tsv.gz"
ac_ac_filt <- "data/multict/fragments/H3K27ac-H3K27ac_filtered.tsv.gz"
ac_me3_filt <- "data/multict/fragments/H3K27ac-H3K27me3_filtered.tsv.gz"
me3_ac_filt <- "data/multict/fragments/H3K27me3-H3K27ac_filtered.tsv.gz"

# filter cells
FilterCells(
  fragments = me3_me3,
  cells = cells_keep_mct,
  outfile = me3_me3_filt
)
FilterCells(
  fragments = ac_ac,
  cells = cells_keep_mct,
  outfile = ac_ac_filt
)
FilterCells(
  fragments = ac_me3,
  cells = cells_keep_mct,
  outfile = ac_me3_filt
)
FilterCells(
  fragments = me3_ac,
  cells = cells_keep_mct,
  outfile = me3_ac_filt
)

h3k27me3$peak <- "H3K27me3"
h3k27ac$peak <- "H3K27ac"
regions <- c(h3k27me3, h3k27ac)

###########
# scatter #
###########

# quantify regions
me3_me3 <- QuantifyRegions(me3_me3_filt, regions)
me3_ac <- QuantifyRegions(me3_ac_filt, regions)
ac_ac <- QuantifyRegions(ac_ac_filt, regions)
ac_me3 <- QuantifyRegions(ac_me3_filt, regions)

mat <- cbind(me3_me3, me3_ac, ac_ac, ac_me3)
mat <- as.data.frame(mat)
mat$region <- regions$peak

mat$me3 <- mat$me3_me3 + (mat$me3_ac/2) + (mat$ac_me3/2)
mat$ac <- mat$ac_ac + (mat$me3_ac/2) + (mat$ac_me3/2)

p_me3_ac <- ggplot(mat, aes(x = ac, y = me3, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  theme(legend.position = 'none') +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  ylab("H3K27me3") +
  xlab("H3K27ac") +
  ggtitle(get_cod_expression(lm(me3 ~ ac, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ggsave(filename = "plots/hek_k562/multi_cuttag_scatter.png", plot = p_me3_ac, height = 4, width = 4)

############
### FRiP ###
############

get_total <- function(frags, cells) {
  tot <- CountFragments(fragments = frags@path, cells = cells)
  totals <- tot$frequency_count
  names(totals) <- tot$CB
  return(totals)
}

get_in_peaks <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = obj,
    features = regions,
    cells = obj@cells
  )
  return(colSums(counts))
}


me3_me3_obj <- CreateFragmentObject(me3_me3_filt, cells = cells_keep_mct)
ac_ac_obj <- CreateFragmentObject(ac_ac_filt, cells = cells_keep_mct)
me3_ac_obj <- CreateFragmentObject(me3_ac_filt, cells = cells_keep_mct)
ac_me3_obj <- CreateFragmentObject(ac_me3_filt, cells = cells_keep_mct)

totals_me3_me3 <- get_total(me3_me3_obj, cells = cells_keep_mct)
totals_ac_me3 <- get_total(ac_me3_obj, cells = cells_keep_mct)
totals_me3_ac <- get_total(me3_ac_obj, cells = cells_keep_mct)
totals_ac_ac <- get_total(ac_ac_obj, cells = cells_keep_mct)

in_me3_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27me3)
in_me3_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27me3)
in_me3_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27me3)
in_me3_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27me3)

in_ac_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27ac)
in_ac_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27ac)
in_ac_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27ac)
in_ac_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27ac)

me3_frip <- data.frame(
  cell = names(totals_me3_me3),
  total = totals_me3_me3 + (totals_ac_me3/2) + (totals_me3_ac/2),
  in_peak = in_me3_peak_me3_me3 + (in_me3_peak_me3_ac/2) + (in_me3_peak_ac_me3/2),
  assay = "H3K27me3",
  peak = "H3K27me3"
)
me3_frip$FRiP <- me3_frip$in_peak / me3_frip$total

ac_frip <- data.frame(
  cell = names(totals_ac_ac),
  total = totals_ac_ac + (totals_ac_me3/2) + (totals_me3_ac/2),
  in_peak = in_ac_peak_ac_ac + (in_ac_peak_me3_ac/2) + (in_ac_peak_ac_me3/2),
  assay = "H3K27ac",
  peak = "H3K27ac"
)
ac_frip$FRiP <- ac_frip$in_peak / ac_frip$total

df <- rbind(me3_frip, ac_frip)

p <- ggplot(df, aes(x = assay, y = FRiP, fill = assay)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(values = colormap) +
  xlab("Antibody") +
  ggtitle("multiCUT&Tag FRiP") +
  theme_bw() +
  theme(legend.position = "none")

ggsave(filename = "plots/hek_k562/multi_cuttag_frip.png", plot = p, height = 4, width = 3)

saveRDS(object = df, file = "objects/frip_mct.rds")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
library(Seurat)
library(Signac)
library(EnsDb.Hsapiens.v86)
library(BSgenome.Hsapiens.UCSC.hg38)
library(future)
library(ggplot2)
library(patchwork)

args <- commandArgs(trailingOnly = TRUE)
threads <- args[[1]]
outfile <- args[[2]]
fpath <- args[[3]]

plan("multicore", workers = as.integer(threads))
options(future.globals.maxSize = Inf)

# annotation
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"
genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23]

fragfiles <- list.files(
  path = fpath,
  pattern = "*.bed.gz$",
  full.names = TRUE
)
fname <- gsub(pattern = paste0(fpath, "/"),
              replacement = "",
              x = fragfiles)
fname <- gsub(pattern = ".bed.gz",
              replacement = "",
              x = fname)
names(fragfiles) <- fname

total_counts <- CountFragments(fragments = as.list(fragfiles))
cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"]

create_assay <- function(frag, cells, genome.use) {
  frag_obj <- CreateFragmentObject(
    path = frag,
    cells = cells
  )
  counts <- AggregateTiles(
    object = frag_obj,
    cells = cells,
    min_counts = 1,
    binsize = 10000,
    genome = genome.use
  )
  return(counts)
}

all.counts <- lapply(
  X = fragfiles,
  FUN = create_assay,
  cells = cells.keep,
  genome.use = genome.use
)

# combine matrices for same barcode pairs
names(all.counts) <- fname

all.cells <- lapply(all.counts, colnames)
common.cells <- Reduce(intersect, all.cells)
all.counts <- lapply(all.counts, function(x) x[, common.cells])
combined.matrix <- do.call(what = rbind, args = all.counts)

# create seurat object
obj <- CreateSeuratObject(
    counts = combined.matrix,
    min.cells = 10,
    assay = "all"
)

obj <- obj[, obj$nCount_all < 10000 & obj$nCount_all > 100]

for (i in seq_along(all.counts)) {
  mat <- all.counts[[i]][, colnames(obj)]
  assay <- CreateChromatinAssay(
    counts = mat,
    min.features = -1,
    fragments = fragfiles[[i]],
    annotation = annot
  )
  obj[[names(all.counts)[i]]] <- assay
}

obj <- subset(obj, subset = nCount_K27ac > 75 &
                nCount_K27me > 150 &
                nCount_Pol2 > 100)
# process
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 50)
obj <- RunSVD(obj)
obj <- RunUMAP(obj, reduction = 'lsi', dims = 2:10)

# create lsi, umap from each assay
DefaultAssay(obj) <- "K27ac"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.k27ac")
obj <- RunUMAP(obj, reduction = 'lsi.k27ac', reduction.name = "umap.k27ac", dims = 2:10)

DefaultAssay(obj) <- "K27me"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.k27me")
obj <- RunUMAP(obj, reduction = 'lsi.k27me', reduction.name = "umap.k27me", dims = 2:10)

DefaultAssay(obj) <- "Pol2"
obj <- RunTFIDF(obj)
obj <- FindTopFeatures(obj, min.cutoff = 10)
obj <- RunSVD(obj, reduction.name = "lsi.pol2")
obj <- RunUMAP(obj, reduction = 'lsi.pol2', reduction.name = "umap.pol2", dims = 2:10)

# WNN
obj <- FindMultiModalNeighbors(
  object = obj,
  reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), 
  dims.list = list(2:10, 2:10, 2:10)
)

obj <- RunUMAP(
  object = obj,
  nn.name = "weighted.nn",
  reduction.name = "wnn.umap"
)

obj <- FindClusters(
  object = obj,
  graph.name = "wsnn",
  algorithm = 3,
  resolution = 0.05
)

obj <- RenameIdents(obj, list("0" = "K562", "1" = "HEK"))
obj$celltype <- Idents(obj)

# save object
saveRDS(object = obj, file = outfile)

# write cell names for vireo
writeLines(colnames(obj), "data/HEK_K562_sc/cells.txt")

# split fragment files by cell type
hek <- WhichCells(object = obj, idents = "HEK")
k562 <- WhichCells(object = obj, idents = "K562")

filter_chrom <- function(infile, outfile, chrom.keep) {
  # open input and output files
  con <- gzfile(infile, 'rb')
  outf <- file(description = outfile, "w")

  # read line by line, writing to output file
  while (TRUE) {
    line = readLines(con, n = 1)
    if (length(line) == 0) {
      break
    } else if (substring(text = line, first = 1, last = 1) == "#") {
      next # skip header
    } else {
      # check chromosome
      fields <- unlist(x = strsplit(x = line, "\t"))
      if (!(fields[[1]] %in% chrom.keep)) {
        next
      } else {
        writeLines(text = line, con = outf)
      }
    }
  }
  close(con)
  close(outf)
  system(
    command = paste0("bgzip ", outfile),
    wait = TRUE,
    ignore.stderr = FALSE,
    ignore.stdout = FALSE
  )
  system(
    command = paste0("tabix -p bed ", paste0(outfile, '.gz')),
    wait = TRUE,
    ignore.stderr = FALSE,
    ignore.stdout = FALSE
  )
}

chr.use <- names(genome.use)
for (i in seq_along(fragfiles)) {
  outf <- paste0("data/HEK_K562_sc/split/",
                 names(fragfiles)[[i]])
  FilterCells(
    fragments = fragfiles[[i]],
    cells = k562,
    outfile = paste0(outf, "_k562.bed.gz")
  )
  FilterCells(
    fragments = fragfiles[[i]],
    cells = hek,
    outfile = paste0(outf, "_hek.bed.gz")
  )

  # filter chromosomes
  filter_chrom(
    infile = paste0(outf, "_k562.bed.gz"),
    outfile = paste0(outf, "_k562_filter.bed"),
    chrom.keep = chr.use
  )
  filter_chrom(
    infile = paste0(outf, "_hek.bed.gz"),
    outfile = paste0(outf, "_hek_filter.bed"),
    chrom.keep = chr.use
  )
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
library(Signac)
library(Seurat)
library(EnsDb.Hsapiens.v86)

annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

me3 <- list("data/henikoff/GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz",
            "data/henikoff/GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz")
ac <- "data/henikoff/GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz"

# count fragments per cell
frags_me3_1 <- CountFragments(me3[[1]])
frags_me3_2 <- CountFragments(me3[[2]])
frags_ac <- CountFragments(ac)

min_frags <- 500
cells_me3_1 <- frags_me3_1[frags_me3_1$frequency_count > min_frags, "CB"]
cells_me3_2 <- frags_me3_2[frags_me3_2$frequency_count > min_frags, "CB"]
cells_ac <- frags_ac[frags_ac$frequency_count > min_frags, "CB"]

f_me3_1 <- CreateFragmentObject(me3[[1]], cells = cells_me3_1)
f_me3_2 <- CreateFragmentObject(me3[[2]], cells = cells_me3_2)
f_ac <- CreateFragmentObject(ac, cells = cells_ac)

me3_counts_1 <- FeatureMatrix(
  fragments = f_me3_1,
  features = granges(pbmc[['me3']]),
  cells = cells_me3_1
)
me3_counts_2 <- FeatureMatrix(
  fragments = f_me3_2,
  features = granges(pbmc[['me3']]),
  cells = cells_me3_2
)
ac_counts <- FeatureMatrix(
  fragments = f_ac,
  features = granges(pbmc[['ac']]),
  cells = cells_ac
)

# change cell barcodes to add batch info
colnames(me3_counts_1) <- paste0(colnames(me3_counts_1), ":1")
colnames(me3_counts_2) <- paste0(colnames(me3_counts_2), ":2")
names(f_me3_1@cells) <- paste0(names(f_me3_1@cells), ":1")
names(f_me3_2@cells) <- paste0(names(f_me3_2@cells), ":2")

me3 <- CreateSeuratObject(
  counts = CreateChromatinAssay(
    counts = cbind(me3_counts_1, me3_counts_2),
    fragments = list(f_me3_1, f_me3_2),
    annotation = annot
  ),
  names.field = 2,
  names.delim = ":",
  assay = "me3",
  meta.data = md
)

me3 <- FindTopFeatures(me3)
me3 <- RunTFIDF(me3)
me3 <- RunSVD(me3, scale.embeddings = TRUE, reduction.name = 'lsi.me3')
me3 <- RunUMAP(me3, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3')

saveRDS(object = me3, file = "objects/henikoff/me3_filt.rds")

ac <- CreateSeuratObject(
  counts = CreateChromatinAssay(
    counts = ac_counts,
    fragments = ac,
    annotation = annot
  ),
  assay = "ac",
  meta.data = md_ac
)

ac <- FindTopFeatures(ac)
ac <- RunTFIDF(ac)
ac <- RunSVD(ac, scale.embeddings = TRUE, reduction.name = 'lsi.ac')
ac <- RunUMAP(ac, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac')

saveRDS(object = ac, file = "objects/henikoff/ac_filt.rds")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
library(Signac)
library(ggplot2)

bigWigs <- list(
  "H3K27me3 (multi)" = "data/pbmc_bulk/mapped/plex/me3.bw",
  "H3K27ac (multi)" = "data/pbmc_bulk/mapped/plex/ac.bw",
  "H3K27me3 (mono)" = "data/pbmc_bulk/mapped/mono/me3.bw",
  "H3K27ac (mono)" = "data/pbmc_bulk/mapped/mono/ac.bw"
)

bw <- BigwigTrack(
  region = "chr1-156630171-156920171",
  bigwig = bigWigs,
  y_label = "Normalized coverage",
  ymax = "q90",
  bigwig.scale = "common",
  smooth = 1000
) + theme(legend.position = "none")

bw <- bw + scale_fill_manual(values = c("#D3145A", "#F98401",
                                        "#676767", "#676767"))

ggsave("plots/pbmc_bulk/bulk_covplot_pbmc.png", plot = bw, height = 5, width = 10, units = "in")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
library(ggplot2)
library(GenomicRanges)
library(Rsamtools)

getFrip <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  in_peak <- countTabix(tabix.file, param = regions)
  total_in_peak <- sum(unlist(in_peak))

  # close connection
  close(con = tabix.file)

  total <- system(
    command = paste0("gzip -dc ", fragfile, " | wc -l"),
    wait = TRUE,
    intern = TRUE
  )
  total <- as.numeric(total)

  frip <- total_in_peak / total
  return(frip)
}


# bulk fragments
me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"
me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz"
ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz"

regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)

regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)

frip_me3_mono_me3_peak <- getFrip(fragfile = me3_mono, regions = regions_me3)
frip_me3_plex_me3_peak <- getFrip(fragfile = me3_plex, regions = regions_me3)
frip_ac_mono_me3_peak <- getFrip(fragfile = ac_mono, regions = regions_me3)
frip_ac_plex_me3_peak <- getFrip(fragfile = ac_plex, regions = regions_me3)

frip_me3_mono_ac_peak <- getFrip(fragfile = me3_mono, regions = regions_ac)
frip_me3_plex_ac_peak <- getFrip(fragfile = me3_plex, regions = regions_ac)
frip_ac_mono_ac_peak <- getFrip(fragfile = ac_mono, regions = regions_ac)
frip_ac_plex_ac_peak <- getFrip(fragfile = ac_plex, regions = regions_ac)

me3_peak <- data.frame(
  frip = c(frip_me3_mono_me3_peak, frip_me3_plex_me3_peak, frip_ac_mono_me3_peak, frip_ac_plex_me3_peak),
  assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex")
)
me3_peak$assay <- factor(me3_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono"))

p1 <- ggplot(me3_peak, aes(assay, frip, fill = assay)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("FRiP") +
  ggtitle("Fraction of reads in H3K27me3 peaks")

ac_peak <- data.frame(
  frip = c(frip_me3_mono_ac_peak, frip_me3_plex_ac_peak, frip_ac_mono_ac_peak, frip_ac_plex_ac_peak),
  assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex")
)
ac_peak$assay <- factor(ac_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono"))

p2 <- ggplot(ac_peak, aes(assay, frip, fill = assay)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) +
  theme_bw() +
  theme(legend.position = "none") +
  ylab("FRiP") +
  ggtitle("Fraction of reads in H3K27ac peaks")

ggsave(filename = "plots/pbmc_bulk/frip_me3.png", plot = p1, height = 4, width = 4)
ggsave(filename = "plots/pbmc_bulk/frip_ac.png", plot = p2, height = 4, width = 4)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
library(ggplot2)
library(patchwork)
library(GenomicRanges)
library(Rsamtools)


me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"
me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz"
ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz"

regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

regions <- c(regions_me3, regions_ac)

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # get total fragments for normalization
  # just read whole thing
  f <- read.table(fragfile, sep = "\t", header = FALSE)
  total <- nrow(f)
  rm(f)
  gc()

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / total) * 10^6
  return(cmp)
}

get_cod_expression <- function(m) {
  eq <- substitute(~~italic(R)^2~"="~r2, 
                   list(r2 = format(summary(m)$r.squared, digits = 2)))
  cod <- as.expression(eq)
  return(cod)
}

ac_plex_cov <- QuantifyRegions(ac_plex, regions)
ac_mono_cov <- QuantifyRegions(ac_mono, regions)
me3_plex_cov <- QuantifyRegions(me3_plex, regions)
me3_mono_cov <- QuantifyRegions(me3_mono, regions)

mat <- cbind(ac_plex_cov, ac_mono_cov, me3_plex_cov, me3_mono_cov)

mat <- as.data.frame(mat)
mat$region <- regions$peak
colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

mat <- mat[sample(nrow(mat)), ]

# p-m
m.use <- mat[mat$region == "H3K27ac", ]
ac_plex_mono <- ggplot(
  data = m.use,
  mapping = aes(ac_plex_cov, ac_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 800)) + ylim(c(0, 800)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27ac (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ ac_mono_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

m.use <- mat[mat$region == "H3K27me3", ]
me3_plex_mono <- ggplot(
  data = m.use,
  mapping = aes(me3_plex_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 100)) + ylim(c(0, 100)) +
  xlab("H3K27me3 (multiple antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(me3_plex_cov ~ me3_mono_cov, data = m.use))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ac_plex_me3_mono <- ggplot(
  data = mat,
  mapping = aes(ac_plex_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_mono_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ac_mono_me3_plex <- ggplot(
  data = mat,
  mapping = aes(ac_mono_cov, me3_plex_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single antibody)") +
  ylab("H3K27me3 (multiple antibody)") +
  ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_plex_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# m-m
ac_mono_me3_mono <- ggplot(
  data = mat,
  mapping = aes(ac_mono_cov, me3_mono_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (single antibody)") +
  ylab("H3K27me3 (single antibody)") +
  ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_mono_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

# p-p
ac_plex_me3_plex <- ggplot(
  data = mat,
  mapping = aes(ac_plex_cov, me3_plex_cov, color = region)) +
  geom_point(size = 0.1) +
  scale_color_manual(values = colormap) +
  theme_classic() +
  xlim(c(0, 400)) + ylim(c(0, 400)) +
  xlab("H3K27ac (multiple antibody)") +
  ylab("H3K27me3 (multiple antibody)") +
  ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_plex_cov, data = mat))) +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

pp <- (ac_mono_me3_mono | ac_mono_me3_plex | ac_plex_me3_mono | ac_plex_me3_plex | ac_plex_mono | me3_plex_mono) & theme(legend.position = "none")
ggsave(filename = "plots/pbmc/bulk_scatter.png", plot = pp, height = 3.5, width = 16, dpi = 600)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
library(Signac)
library(Seurat)
library(ggplot2)
library(patchwork)
library(dplyr)


args <- commandArgs(trailingOnly = TRUE)
ntt <- readRDS(args[[1]])
ct_ac <- readRDS(args[[2]])
ct_me <- readRDS(args[[3]])
pbmc_sc <- readRDS(args[[4]])

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")
mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752")

# QC plots
me3_frag <- Fragments(ntt[['me3']])[[1]]@path
ac_frag <- Fragments(ntt[['ac']])[[1]]@path

tot_me3 <- CountFragments(
  fragments = me3_frag,
  cells = colnames(ntt)
)
tot_me3$mark <- "H3K27me3"
tot_ac <- CountFragments(
  fragments = ac_frag,
  cells = colnames(ntt)
)
tot_ac$mark <- "H3K27ac"

all_counts <- rbind(tot_me3, tot_ac)

mean(tot_me3$frequency_count) # 2854
mean(tot_ac$frequency_count) # 412
sd(tot_me3$frequency_count) # 2953
sd(tot_ac$frequency_count) # 356

# pbmc replicate
me3_frag_2 <- Fragments(pbmc_sc[['me3']])[[1]]@path
ac_frag_2 <- Fragments(pbmc_sc[['ac']])[[1]]@path

tot_me3_2 <- CountFragments(
  fragments = me3_frag_2,
  cells = colnames(pbmc_sc)
)
tot_me3_2$mark <- "H3K27me3"
tot_ac_2 <- CountFragments(
  fragments = ac_frag_2,
  cells = colnames(pbmc_sc)
)
tot_ac_2$mark <- "H3K27ac"

all_counts_2 <- rbind(tot_me3_2, tot_ac_2)

mean(tot_me3_2$frequency_count) # 670
mean(tot_ac_2$frequency_count) # 731
sd(tot_me3_2$frequency_count) # 1243
sd(tot_ac_2$frequency_count) # 1035

all_counts$dataset <- "scNTT-seq"
all_counts_2$dataset <- "scNTT-seq"

# ct henikoff
me3_frag_ct <- Fragments(ct_me[['tiles']])[[1]]@path
ac_frag_ct <- Fragments(ct_ac[["tiles"]])[[1]]@path

tot_me3_ct <- CountFragments(
  fragments = me3_frag_ct,
  cells = colnames(ct_me)
)
tot_me3_ct$mark <- "H3K27me3"
tot_ac_ct <- CountFragments(
  fragments = ac_frag_ct,
  cells = colnames(ct_ac)
)
tot_ac_ct$mark <- "H3K27ac"

all_counts_ct <- rbind(tot_me3_ct, tot_ac_ct)

mean(tot_me3_ct$frequency_count) # 519
mean(tot_ac_ct$frequency_count) # 283
sd(tot_me3_ct$frequency_count) # 383
sd(tot_ac_ct$frequency_count) # 272

all_counts_ct$dataset <- "scCUT&Tag"

all_counts <- rbind(all_counts, all_counts_ct)

p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments per cell")

p01 <- ggplot(all_counts, aes(mark, reads_count, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  ylab("Reads") +
  xlab("Mark") +
  ggtitle("Total reads per cell") +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/fragments.png", plot = p01 | p0, height = 4, width = 7)

# replicate 2
p0 <- ggplot(all_counts_2, aes(mark, frequency_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  scale_fill_manual(values = colormap[1:2]) +
  ylab("Fragments") +
  xlab("Mark") +
  ggtitle("Total fragments per cell") +
  theme(legend.position = "none")

p01 <- ggplot(all_counts_2, aes(mark, reads_count, fill = mark)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() + 
  scale_fill_manual(values = colormap[1:2]) +
  ylab("Reads") +
  xlab("Mark") +
  ggtitle("Total reads per cell") +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/fragments_rep2.png", plot = p01 | p0, height = 4, width = 7)


t1 <- TSSPlot(ntt, assay = "me3", group.by = "orig.ident") +
  ggtitle("H3K27me3") + scale_color_manual(values = colormap$H3K27me3)
t2 <- TSSPlot(ntt, assay = "ac", group.by = "orig.ident") +
  ggtitle("H3K27ac") + scale_color_manual(values = colormap$H3K27ac)
tt <- (t1 | t2) & ylim(c(0, 7))
ggsave(filename = "plots/pbmc/tss.png", plot = tt, height = 8, width = 5)

p1 <- DimPlot(ntt, reduction = "umap.me3", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27me3") + NoLegend()
p2 <- DimPlot(ntt, reduction = "umap.ac", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27ac") + NoLegend()
p3 <- DimPlot(ntt, reduction = "umap.adt", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("ADT") + NoLegend()
p4 <- DimPlot(ntt, reduction = "umap.wnn", label = TRUE, label.size = 4) + ggtitle("PBMCs: H3K27me3 + H3K27ac + protein") + theme_classic() + NoLegend() + ylab("UMAP 1") + xlab("UMAP 2") 

pp <- ((p1 / p2 / p3) | p4) + plot_layout(widths = c(1, 2))
ggsave(filename = "plots/pbmc/dimplots.png", plot = pp, height = 5, width = 7)

# wnn weight
v1 <- VlnPlot(ntt, c("ac.weight", "me3.weight", "ADT.weight"), ncol = 1, pt.size = 0)
ggsave(filename = "plots/pbmc/wnn_weight.png", plot = v1, height = 8, width = 12)

# adt sensitivity
df <- ntt[[]]
df$dataset <- "scNTT-seq"
df <- df[, c("nCount_ADT", "dataset")]
mean(df$nCount_ADT) # 690
sd(df$nCount_ADT)   # 613

df2 <- ct_me[[]]
df2$dataset <- "CUT&Tag-pro (H3K27me3)"
df2 <- df2[, c("nCount_ADT", "dataset")]

df3 <- ct_ac[[]]
df3$dataset <- "CUT&Tag-pro (H3K27ac)"
df3 <- df3[, c("nCount_ADT", "dataset")]

df <- rbind(df, df2, df3)

v2 <- ggplot(data = df, aes(dataset, nCount_ADT, fill = dataset)) +
  geom_violin(size = 0.2) +
  scale_y_log10() +
  theme_bw() +
  theme(axis.title.x=element_blank(),
        axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        legend.position = 'none') +
  ggtitle("ADT counts per cell")

ggsave(filename = "plots/pbmc/ncount_adt.png", plot = v2, height = 4, width = 2.5)

# featureplots adt
DefaultAssay(ntt) <- "ADT"
fp <- FeaturePlot(
  ntt,
  c("CD14", "CD19", "CD4", "CD8A", "IL2RB", "TFRC"),
  pt.size = 0.1,
  max.cutoff = 'q99',
  cols = c("lightgrey", "darkgreen"),
  ncol = 3,
  reduction = 'umap.wnn'
) & xlab("UMAP 1") & ylab("UMAP 2") & NoLegend()

ggsave(filename = "plots/pbmc/featureplots.png", plot = fp, height = 6, width = 8)

DefaultAssay(ntt) <- "me3"
cp <- CoveragePlot(
  object = ntt,
  region = "PAX5",
  assay = list("me3", "ac"),
  split.assays = FALSE,
  assay.scale = "separate",
  features = c("CD19"),
  expression.assay = "ADT",
  idents = c("CD14+ Mono", "B cell"),
  extend.upstream = -30000,
  extend.downstream = 10000,
  window = 500,
  peaks = FALSE
)
cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac))
ggsave(filename = "plots/pbmc/pax5_covplot.png", plot = cp, height = 3, width = 8)

# colorscale
cols <- colorRampPalette(c("lightgrey", "darkgreen"))(100)

png("plots/pbmc/colorbar.png", width = 10, height = 4, units = "in", res = 400)
image(matrix(1:100), col=cols)
dev.off()

cp <- CoveragePlot(
  object = ntt,
  region = "CD33",
  assay = list("me3", "ac"),
  split.assays = FALSE,
  assay.scale = "separate",
  features = c("CD33"),
  expression.assay = "ADT",
  idents = c("CD14+ Mono", "B cell"),
  extend.upstream = 2000,
  extend.downstream = 2000,
  window = 500,
  peaks = FALSE
)
cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac))
ggsave(filename = "plots/pbmc/cd33_covplot.png", plot = cp, height = 3, width = 8)

# run WNN on pair of chromatin assays, quantify how well structure matches protein neighbor graph

ntt <- FindMultiModalNeighbors(
  object = ntt,
  reduction.list = list("lsi.me3", "lsi.ac"),
  dims.list = list(2:30, 2:30),
  knn.graph.name = "wknn.chrom",
  weighted.nn.name = "wnn.chrom",
  modality.weight.name = "wnn.chrom"
)

# compute KNN purity using ADT-defined cell types, compare different graphs
knn_purity_graph <- function(graph, clusters) {
  # pre-existing graph, can't change k
  nn_purity <- vector(mode = "numeric", length = length(x = clusters))
  for (i in seq_len(length.out = nrow(x = graph))) {
    nbr <- names(which(graph[i, ] == 1))
    nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ])
  }
  return(nn_purity)
}

clusters <- ntt$celltype

ntt <- FindNeighbors(ntt, reduction = "lsi.me3", dims = 2:30, k.param = 100)
ntt <- FindNeighbors(ntt, reduction = "lsi.ac", dims = 2:30, k.param = 100)
ntt <- FindNeighbors(ntt, reduction = "pca", dims = 1:30, k.param = 100)

graph.me3 <- ntt[['me3_nn']]
graph.ac <- ntt[['ac_nn']]
graph.chrom <- ntt[['wknn.chrom']]
graph.adt <- ntt[['ADT_nn']]
graph.wnn <- ntt[['wknn']]

kme3 <- data.frame(
  purity = knn_purity_graph(graph = graph.me3, clusters = clusters),
  assay = "H3K27me3",
  celltype = clusters
)
kac <- data.frame(
  purity = knn_purity_graph(graph = graph.ac, clusters = clusters),
  assay = "H3K27ac",
  celltype = clusters
)
kchrom <- data.frame(
  purity = knn_purity_graph(graph = graph.chrom, clusters = clusters),
  assay = "H3K27me3 + H3K27ac",
  celltype = clusters
)
kadt <- data.frame(
  purity = knn_purity_graph(graph = graph.adt, clusters = clusters),
  assay = "ADT",
  celltype = clusters
)
kwnn <- data.frame(
  purity = knn_purity_graph(graph = graph.wnn, clusters = clusters),
  assay = "ADT + H3K27me3 + H3K27ac",
  celltype = clusters
)

kdf <- rbind(kme3, kac, kchrom, kadt)
kdf$assay <- factor(kdf$assay, levels = c("H3K27ac", "H3K27me3", "H3K27me3 + H3K27ac", "ADT"))

df <- kdf %>% 
  group_by(assay) %>% 
  mutate(total_025 = sum(purity < 0.25) / n()) %>% 
  select(assay, total_025) %>% 
  unique()

knn_p <- ggplot(df, aes(assay, total_025, fill = assay)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(legend.position = "none") +
  xlab("Assay") + ylab("Fraction of cells with \n<25% neighbors same celltype") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  scale_fill_manual(
    values = c(colormap$H3K27ac, colormap$H3K27me3, mixed$ma, "darkgreen")
  )

ggsave(filename = "plots/pbmc/knn_purity.pdf", plot = knn_p, height = 4, width = 3)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
library(ggplot2)
library(patchwork)

mat_ac <- read.table("data/pbmc_protein/bigwig/raw_ac.tsv", sep = "\t", header = TRUE, comment.char = "")
mat_ac <- mat_ac[, 4:ncol(mat_ac)] # remove region
mat_ac <- as.matrix(mat_ac)
mat_ac[is.nan(mat_ac)] <- 0
mat_ac <- as.data.frame(mat_ac)

mat_me3 <- read.table("data/pbmc_protein/bigwig/raw_me3.tsv", sep = "\t", header = TRUE, comment.char = "")
mat_me3 <- mat_me3[, 4:ncol(mat_me3)] # remove region
mat_me3 <- as.matrix(mat_me3)
mat_me3[is.nan(mat_me3)] <- 0
mat_me3 <- as.data.frame(mat_me3)

encode <- list("ENCFF569IPX.bigWig" = "NK_H3K27me3_chip",
               "ENCFF293ETP.bigWig" = "NK_H3K27ac_chip",
               "ENCFF842JLZ.bigWig" = "CD8_memory_H3K27me3_chip",
               "ENCFF611GRL.bigWig" = "CD8_memory_H3K27ac_chip",
               "ENCFF181OXO.bigWig" = "CD4_H3K27ac_chip",
               "ENCFF811VAX.bigWig" = "CD4_H3K27me3_chip",
               "ENCFF526VJO.bigWig" = "CD8_H3K27ac_chip",
               "ENCFF499VWN.bigWig" = "CD8_H3K27me3_chip",
               "ENCFF777EGG.bigWig" = "CD14_mono_H3K27me3_chip",
               "ENCFF601NLG.bigWig" = "CD14_mono_H3K27ac_chip",
               "ENCFF951ZBV.bigWig" = "B_cell_H3K27ac_chip",
               "ENCFF046VLL.bigWig" = "B_cell_H3K27me3_chip",
               "ENCFF085YMB.bigWig" = "CMP_CD34_H3K27me3_chip",
               "ENCFF064JOI.bigWig" = "CMP_CD34_H3K27ac_chip")

process_matrix <- function(mat, encode, method = "pearson") {
    encode_mat <- colnames(mat) %in% names(encode)
    colnames(mat)[encode_mat] <- unname(encode[colnames(mat)[encode_mat]])
    colnames(mat) <- gsub(".bw", "", colnames(mat), fixed = TRUE)

    # work out me3 or ac
    if (any(grepl("me3", colnames(mat)))) {
        mat <- mat[, c("B_cell_me3", "B_cell_H3K27me3_chip",
               "NK_me3", "NK_H3K27me3_chip",
               "CD14_Mono_me3", "CD14_mono_H3K27me3_chip",
               "CD8_T_cell_me3", "CD8_H3K27me3_chip",
               "CD4_T_cell_me3", "CD4_H3K27me3_chip",
               "Late_erythroid_me3", "CMP_CD34_H3K27me3_chip")]
    } else {
        mat <- mat[, c("B_cell_ac", "B_cell_H3K27ac_chip",
               "NK_ac", "NK_H3K27ac_chip",
               "CD14_Mono_ac", "CD14_mono_H3K27ac_chip",
               "CD8_T_cell_ac", "CD8_H3K27ac_chip",
               "CD4_T_cell_ac", "CD4_H3K27ac_chip",
               "Late_erythroid_ac", "CMP_CD34_H3K27ac_chip")]
    }
    all_cor <- cor(mat, method = method)
    ac <- reshape2::melt(all_cor)
    ac <- ac[ac$Var1 %in% encode, ]
    ac <- ac[ac$Var2 %in% setdiff(colnames(mat), encode), ]
    ac$`Pearson correlation` <- ac$value
    return(ac)
}

me3 <- process_matrix(mat=mat_me3, encode=encode, method = "spearman")
ac <- process_matrix(mat=mat_ac, encode=encode, method = "spearman")

cells_keep <- c("CD14_Mono_me3", "Late_erythroid_me3", "B_cell_me3")
encode_keep <- c("B_cell_H3K27me3_chip", "CD14_mono_H3K27me3_chip", "CMP_CD34_H3K27me3_chip")

me3 <- me3[me3$Var2 %in% cells_keep, ]
me3 <- me3[me3$Var1 %in% encode_keep, ]

cells_keep <- c("CD14_Mono_ac", "Late_erythroid_ac", "B_cell_ac")
encode_keep <- c("B_cell_H3K27ac_chip", "CD14_mono_H3K27ac_chip", "CMP_CD34_H3K27ac_chip")

ac <- ac[ac$Var2 %in% cells_keep, ]
ac <- ac[ac$Var1 %in% encode_keep, ]

replacement_encode <- list("B_cell_H3K27ac_chip" = "B cell",
                           "CD14_mono_H3K27ac_chip" = "CD14 Monocyte",
                           "CMP_CD34_H3K27ac_chip" = "CD34+ CMP")

replacement_sc <- list("B_cell_ac" = "B cell",
                       "CD14_Mono_ac" = "CD14 Monocyte",
                       "Late_erythroid_ac" = "Late_erythroid")

ac$Var1 <- as.character(replacement_encode[as.character(ac$Var1)])
ac$Var2 <- as.character(replacement_sc[as.character(ac$Var2)])

replacement_encode <- list("B_cell_H3K27me3_chip" = "B cell",
                           "CD14_mono_H3K27me3_chip" = "CD14 Monocyte",
                           "CMP_CD34_H3K27me3_chip" = "CD34+ CMP")

replacement_sc <- list("B_cell_me3" = "B cell",
                       "CD14_Mono_me3" = "CD14 Monocyte",
                       "Late_erythroid_me3" = "Late_erythroid")

me3$Var1 <- as.character(replacement_encode[as.character(me3$Var1)])
me3$Var2 <- as.character(replacement_sc[as.character(me3$Var2)])

p1 <- ggplot(me3, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdYlBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("Bulk-cell ChIP-seq") +
  ylab("Single-cell multiplexed NTT-seq") +
  ggtitle("H3K27me3") +
  theme(legend.position = "none")

p2 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdYlBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("Bulk-cell ChIP-seq") +
  ylab("Single-cell multiplexed NTT-seq") +
  ggtitle("H3K27ac")

pp <- (p1 / p2)

ggsave(filename = "plots/pbmc/encode_cor_spearman.png", plot = pp, height = 6, width = 5)

# correlation between replicates
mat <- read.table("data/pbmc_protein/replicate_cor.tsv", sep = "\t", header = TRUE, comment.char = "")
mat <- mat[, 4:ncol(mat)] # remove region
mat <- as.matrix(mat)
mat[is.nan(mat)] <- 0
mat <- as.data.frame(mat)

colnames(mat) <- c(
  "Rep. 1 H3K27me3",
  "Rep. 1 H3K27ac",
  "Rep. 2 H3K27me3",
  "Rep. 2 H3K27ac"
)

mat <- mat[, c("Rep. 1 H3K27me3", "Rep. 2 H3K27me3", "Rep. 1 H3K27ac", "Rep. 2 H3K27ac")]

all_cor <- cor(mat, method = "pearson")
ac <- reshape2::melt(all_cor)
ac$`Pearson correlation` <- ac$value

p1 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) +
  geom_tile() +
  theme_classic() +
  scale_fill_distiller(palette = "RdBu") +
  scale_x_discrete(guide = guide_axis(angle = 45)) +
  xlab("") +
  ylab("")

ggsave("plots/pbmc/replicate_correlation.png", plot = p1, height = 3, width = 5)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
library(Signac)
library(Seurat)
library(GenomicRanges)
library(ggplot2)

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401")

pbmc <- readRDS("objects/pbmc_protein.rds")
pbmc_sc <- readRDS("objects/pbmc_sc.rds")
henikoff_me3_filt <- readRDS("objects/henikoff/me3_filt.rds")
henikoff_ac_filt <- readRDS("objects/henikoff/ac_filt.rds")

# ENCODE peaks
pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz")
pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz")

pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3")
pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3")

# total fragments for each dataset

get_total <- function(frags, cells) {
  tot <- CountFragments(fragments = frags, cells = cells)
  totals <- tot$frequency_count
  names(totals) <- tot$CB
  return(totals)
}

# pbmc
totals_pbmc_ac <- get_total(Fragments(pbmc[['ac']])[[1]]@path, colnames(pbmc))
totals_pbmc_me3 <- get_total(Fragments(pbmc[['me3']])[[1]]@path, cells = colnames(pbmc))

# new replicate
totals_pbmc_ac_n <- get_total(Fragments(pbmc_sc[['ac']])[[1]]@path, cells = colnames(pbmc_sc))
totals_pbmc_me3_n <- get_total(Fragments(pbmc_sc[['me3']])[[1]]@path, cells = colnames(pbmc_sc))

# henikoff
r1_cells_filt <- henikoff_me3_filt$orig.ident == '1'
r2_cells_filt <- henikoff_me3_filt$orig.ident == '2'
totals_h_me3_1_f <- get_total(Fragments(henikoff_me3_filt)[[1]]@path, cells = henikoff_me3_filt$barcode[r1_cells_filt])
totals_h_me3_2_f <- get_total(Fragments(henikoff_me3_filt)[[2]]@path, cells = henikoff_me3_filt$barcode[r2_cells_filt])
names(totals_h_me3_1_f) <- paste0(names(totals_h_me3_1_f), ":1")
names(totals_h_me3_2_f) <- paste0(names(totals_h_me3_2_f), ":2")
totals_h_me3_f <- c(totals_h_me3_1_f, totals_h_me3_2_f)

totals_h_ac_f <- get_total(Fragments(henikoff_ac_filt)[[1]]@path, cells = colnames(henikoff_ac_filt))

# total counts in peak regions
get_in_peaks <- function(obj, regions) {
  counts <- FeatureMatrix(
    fragments = Fragments(obj),
    features = regions,
    cells = colnames(obj)
  )
  return(colSums(counts))
}

# ntt
in_peak_pbmc_ac_ac <- get_in_peaks(obj = pbmc[['ac']], regions = pk_ac)
in_peak_pbmc_ac_me <- get_in_peaks(obj = pbmc[['ac']], regions = pk_me3)
in_peak_pbmc_me_me <- get_in_peaks(obj = pbmc[['me3']], regions = pk_me3)
in_peak_pbmc_me_ac <- get_in_peaks(obj = pbmc[['me3']], regions = pk_ac)

# rep2
in_peak_pbmc_ac_ac_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_ac)
in_peak_pbmc_ac_me_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_me3)
in_peak_pbmc_me_me_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_me3)
in_peak_pbmc_me_ac_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_ac)

# henikoff filt
in_peak_h_ac_ac_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_ac)
in_peak_h_ac_me_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_me3)
in_peak_h_me_me_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_me3)
in_peak_h_me_ac_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_ac)

# frip
make_df <- function(totals, inpeak, assay, pk, dataset) {
  d <- data.frame(
    cell = names(totals),
    total = totals,
    in_peak = inpeak[names(totals)],
    assay = assay,
    peak = pk,
    dataset = dataset
  )
  d$FRIP <- d$in_peak / d$total
  return(d)
}

df <- rbind(
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq"),
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq"),

  make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag"),
  make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag"),
  make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag"),
  make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag")
)

df2 <- rbind(
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq"),
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq"),
)

df_filt <- df[df$assay == df$peak, ]
df_filt <- df_filt[!is.na(df_filt$FRIP), ]

p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) +
  geom_boxplot(outlier.size = 0.5) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("FRiP") +
  theme(legend.position = 'none')

ggsave("plots/pbmc/frip_pbmc.png", plot = p, height = 4, width = 2.5)


p2 <- ggplot(df2, aes(x = assay, y = FRIP, fill = assay)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_fill_manual(values = colormap) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("FRiP") +
  theme(legend.position = 'none')
ggsave("plots/pbmc/frip_pbmc_rep2.png", plot = p2, height = 4, width = 2.5)

saveRDS(object = df_filt, file = "data/pbmc_protein/frip.rds")

df1 <- df_filt[df_filt$dataset == "scNTT-seq", ]

mean(df1[df1$peak == "H3K27ac", "FRIP"])  # 0.21
sd(df1[df1$peak == "H3K27ac", "FRIP"])  # 0.09
mean(df1[df1$peak == "H3K27me3", "FRIP"])  # 0.11
sd(df1[df1$peak == "H3K27me3", "FRIP"])  # 0.05

mean(df2[df2$peak == "H3K27ac", "FRIP"])  # 0.28
sd(df2[df2$peak == "H3K27ac", "FRIP"])  # 0.08
mean(df2[df2$peak == "H3K27me3", "FRIP"])  # 0.10
sd(df2[df2$peak == "H3K27me3", "FRIP"])  # 0.06


# revision
df3 <- rbind(
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 1)"),
  make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 1)"),

  make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag (PBMC)"),
  make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag (PBMC)"),
  make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag (PBMC)"),
  make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag (PBMC)"),

  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 2)"),
  make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 2)")
)


bmmc_frip <- readRDS(file = "data/bmmc_dual/frip.rds")
bmmc_frip$dataset <- "scNTT-seq (BMMC)"

df3 <- rbind(bmmc_frip, df3)
df_filt <- df3[df3$assay == df3$peak, ]
df_filt <- df_filt[!is.na(df_filt$FRIP), ]

p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) +
  geom_boxplot(outlier.size = 0.5) +
  xlab("Mark") +
  theme_bw() +
  ggtitle("Fragments in peaks")

ggsave("plots/pbmc/frip_pbmc_revision.png", plot = p, height = 4, width = 6)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
library(Seurat)
library(Signac)
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(tximport)
library(future)


args <- commandArgs(trailingOnly = TRUE)
threads <- as.numeric(args[[1]])
adt <- args[[2]]
ac_frag <- args[[3]]
me3_frag <- args[[4]]
outfile <- args[[5]]

plan("multicore", workers = as.numeric(threads))
options(future.globals.maxSize = Inf)

# process chromatin data
annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86)
seqlevelsStyle(annot) <- "UCSC"

total_me3 <- CountFragments(fragments = me3_frag)
total_ac <- CountFragments(fragments = ac_frag)

total_ac <- total_ac[total_ac$frequency_count > 100, ]
total_me3 <- total_me3[total_me3$frequency_count > 100, ]
cells_keep <- intersect(total_ac$CB, total_me3$CB)

frag_obj_me3 <- CreateFragmentObject(path = me3_frag, cells = cells_keep)
frag_obj_ac <- CreateFragmentObject(path = ac_frag, cells = cells_keep)

chr_use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22]
counts_me3 <- AggregateTiles(
  object = frag_obj_me3,
  genome = chr_use,
  cells = cells_keep,
  min_counts = 1
)

counts_ac <- AggregateTiles(
  object = frag_obj_ac,
  genome = chr_use,
  cells = cells_keep,
  min_counts = 1
)

# remove features overlapping blacklist
remove_blacklist <- function(counts) {
  olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified)
  is_bl <- queryHits(olap)
  counts <- counts[setdiff(1:nrow(counts), is_bl), ]
  return(counts)
}

assay_me3 <- CreateChromatinAssay(
  counts = remove_blacklist(counts_me3),
  fragments = me3_frag,
  annotation = annot
)
assay_ac <- CreateChromatinAssay(
  counts = remove_blacklist(counts_ac),
  fragments = ac_frag,
  annotation = annot
)
obj <- CreateSeuratObject(counts = assay_me3, assay = "me3")
obj[['ac']] <- assay_ac

# add adt data
adt.txi <- tximport(files = adt, type = "alevin")
cells <- intersect(colnames(obj), colnames(adt.txi$counts))
obj <- obj[, cells]
obj[['ADT']] <- CreateAssayObject(counts = adt.txi$counts[, cells])

obj <- subset(obj, subset = nCount_me3 < 40000 &
                nCount_ac < 10000 &
                nCount_me3 > 300 &
                nCount_ac > 100 &
                nCount_ADT < 10000 &
                nCount_ADT > 120)

DefaultAssay(obj) <- "ADT"
obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2)
igg_features <- rownames(obj)[grepl("^Ig", x = rownames(obj))]
var_feat <- setdiff(rownames(obj), igg_features)
VariableFeatures(obj) <- var_feat
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40)
obj <- FindClusters(obj)
obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40)

# remove artifact clusters
obj <- subset(obj, idents = c(4, 8), invert = TRUE)

obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2)
obj <- ScaleData(obj)
obj <- RunPCA(obj)
obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40)
obj <- FindClusters(obj)
obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40)

DefaultAssay(obj) <- "me3"
feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features
obj[['me3']] <- subset(obj[['me3']], features = feat.keep)
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.me3')
obj <- RunUMAP(obj, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3')

DefaultAssay(obj) <- "ac"
feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features
obj[['ac']] <- subset(obj[['ac']], features = feat.keep)
obj <- FindTopFeatures(obj)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.ac')
obj <- RunUMAP(obj, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac')

obj <- FindMultiModalNeighbors(
  object = obj,
  reduction.list = list("lsi.me3", "lsi.ac", "pca"),
  dims.list = list(2:30, 2:30, 1:30)
)
obj <- RunUMAP(obj, nn.name = "weighted.nn", reduction.name = "umap.wnn")
obj <- FindClusters(obj, graph.name = "wsnn", algorithm = 3, resolution = 1)

Idents(obj) <- "seurat_clusters"

obj <- RenameIdents(obj, list("0" = "CD14+ Mono",
                              "1" = "CD14+ Mono",
                              "2" = "B cell",
                              "3" = "CD14+ Mono",
                              "4" = "CD4 T cell",
                              "5" = "CD16+ Mono",
                              "6" = "Late erythroid",
                              "7" = "CD8 T cell",
                              "8" = "B cell",
                              "9" = "NK",
                              "10" = "cDC"
                             )
)
obj$celltype <- Idents(obj)

obj <- TSSEnrichment(obj, fast = FALSE, assay = "me3")
obj <- TSSEnrichment(obj, fast = FALSE, assay = "ac")

saveRDS(object = obj, file = outfile)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
library(Seurat)
library(Signac)
library(Rsamtools)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ggplot2)
library(patchwork)


args <- commandArgs(trailingOnly = TRUE)
ac_peaks <- args[[1]]
me3_peaks <- args[[2]]

colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A")

regions_ac <- read.table(ac_peaks, sep = "\t",
                         col.names = c("chromosome", "start", "end", "name", "score",
                                       "strand", "a", "b" , "c", "d"))
regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
regions_ac$peak <- "H3K27ac"

regions_me3 <- read.table(me3_peaks, sep = "\t",
                          col.names = c("chromosome", "start", "end", "name", "score",
                                        "strand", "a", "b" , "c", "d"))
regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
regions_me3$peak <- "H3K27me3"

# regions_me3 <- read.table("data/pbmc_bulk/mapped/me3_peaks.broadPeak", sep = "\t",
#                           col.names = c("chromosome", "start", "end", "name", "score",
#                                         "strand", "a", "b" , "c"))
# regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE)
# regions_me3$peak <- "H3K27me3"
# regions_me3 <- keepStandardChromosomes(regions_me3, pruning.mode = "coarse")
# regions_me3 <- dropSeqlevels(regions_me3, value = "chrM", pruning.mode = "coarse")
# 
# regions_ac <- read.table("data/pbmc_bulk/mapped/ac_peaks.narrowPeak", sep = "\t",
#                           col.names = c("chromosome", "start", "end", "name", "score",
#                                         "strand", "a", "b" , "c", "d"))
# regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE)
# regions_ac$peak <- "H3K27ac"
# regions_ac <- keepStandardChromosomes(regions_ac, pruning.mode = "coarse")
# regions_ac <- dropSeqlevels(regions_ac, value = "chrM", pruning.mode = "coarse")

regions <- c(regions_me3, regions_ac)

QuantifyRegions <- function(fragfile, regions) {
  results <- vector(mode = "numeric", length = length(regions))

  # open tabix connection
  tabix.file <- TabixFile(file = fragfile)
  open(con = tabix.file)

  # get fragments in each region
  for (j in seq_along(regions)) {
    reads <- scanTabix(file = tabix.file, param = regions[j])
    results[j] <- sum(sapply(X = reads, FUN = length))
  }

  # close connection
  close(con = tabix.file)
  cmp <- (results / sum(results)) * 10^6
  return(cmp)
}

# get filtered fragment files
# single-cell
frags_me3 <- "data/pbmc_protein/bigwig/pbmc_me3.bed.gz"
frags_ac <- "data/pbmc_protein/bigwig/pbmc_ac.bed.gz"

# bulk_cell
bulk_me3 <- "data/pbmc_bulk/mapped/mono/me3.bed.gz"
bulk_ac <- "data/pbmc_bulk/mapped/mono/ac.bed.gz"

# quantify
ac_sc_cov <- QuantifyRegions(frags_ac, regions)
me_sc_cov <- QuantifyRegions(frags_me3, regions)

me_bulk_cov <- QuantifyRegions(bulk_me3, regions)
ac_bulk_cov <- QuantifyRegions(bulk_ac, regions)

# create matrix
mat <- cbind(ac_sc_cov, me_sc_cov, ac_bulk_cov, me_bulk_cov)
mat <- as.data.frame(mat)
mat$region <- regions$peak

# randomize order
mat <- mat[sample(nrow(mat)), ]

get_cod_expression <- function(m) {
    eq <- substitute(~~italic(R)^2~"="~r2, 
     list(r2 = format(summary(m)$r.squared, digits = 2)))
    cod <- as.expression(eq)
    return(cod)
}

ac_me3_sc <- ggplot(mat, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  ylim(c(0, 600)) + xlim(c(0, 600)) +
  xlab("") +
  ylab("") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat))) +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  geom_abline(intercept = 0, slope = 1, linetype = 2) +
  theme(legend.position = "none")

ac_sc_ac_bulk <- ggplot(mat[mat$region=="H3K27ac", ], aes(x = ac_sc_cov, y = ac_bulk_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  xlab("H3K27ac (single-cell)") +
  ylab("H3K27ac (bulk)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ ac_bulk_cov, data = mat))) +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  theme(legend.position = "none")

ggsave(filename = "plots/pbmc/scatterplot_pbmc.png", plot = ac_me3_sc, height = 3.5, width = 3.5)

## BMMC
frags_me3_bmmc <- "data/bmmc_dual/bulk_fragments/me3.bed.gz"
frags_ac_bmmc <- "data/bmmc_dual/bulk_fragments/ac.bed.gz"

# quantify
me_sc_cov <- QuantifyRegions(frags_me3_bmmc, regions)
ac_sc_cov <- QuantifyRegions(frags_ac_bmmc, regions)

# create matrix
mat_bmmc <- cbind(ac_sc_cov, me_sc_cov)
mat_bmmc <- as.data.frame(mat_bmmc)
mat_bmmc$region <- regions$peak

mat_bmmc <- mat_bmmc[sample(nrow(mat_bmmc)), ]

ac_me3_sc_bmmc <- ggplot(mat_bmmc, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) +
  geom_point(size = 0.1) +
  theme_classic() +
  ylim(c(0, 600)) + xlim(c(0, 600)) +
  xlab("H3K27ac (multiplexed single-cell)") +
  scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) +
  ylab("H3K27me3 (multiplexed single-cell)") +
  ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat_bmmc))) +
  theme(legend.position = "none") +
  geom_abline(intercept = 0, slope = 1, linetype = 2)

ggsave(filename = "plots/bmmc/scatterplot_bmmc.png", plot = ac_me3_sc_bmmc, height = 4, width = 4)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
library(Signac)
library(Seurat)

obj <- readRDS("objects/pbmc_protein.rds")
Idents(obj) <- "celltype"

outdir <- "data/pbmc_protein/bigwig/"
celltypes.split <- levels(obj)

# split fragments for each assay
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
for (i in celltypes.split) {
    ct_safe <- gsub(" ", "_", i, fixed = TRUE)
    ct_safe <- gsub("+", "", ct_safe, fixed = TRUE)
    message(ct_safe)
    FilterCells(
      fragments = frags,
      cells = WhichCells(object = obj, idents = i),
      outfile = paste0(outdir, ct_safe, "_ac.bed.gz"),
      verbose = TRUE
    )
}

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
for (i in celltypes.split) {
    ct_safe <- gsub(" ", "_", i, fixed = TRUE)
    ct_safe <- gsub("+", "", ct_safe, fixed = TRUE)
    message(ct_safe)
    FilterCells(
      fragments = frags,
      cells = WhichCells(object = obj, idents = i),
      outfile = paste0(outdir, ct_safe, "_me3.bed.gz"),
      verbose = TRUE
    )
}

# remove non-cell fragments for each assay for genome bin quant
DefaultAssay(obj) <- "ac"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "pbmc_ac.bed.gz"),
  verbose = TRUE
)

DefaultAssay(obj) <- "me3"
frags <- Fragments(obj)[[1]]@path
FilterCells(
  fragments = frags,
  cells = colnames(obj),
  outfile = paste0(outdir, "pbmc_me3.bed.gz"),
  verbose = TRUE
)
21
22
23
24
25
shell:
    """
    cd data
    wget https://hgdownload-test.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
    """
33
34
35
36
37
38
shell:
    """
    cd genome/hg38
    aws s3 sync s3://stuart-genomes/hg38_analysis/bwa-mem2/ .
    aws s3 cp s3://stuart-genomes/hg38_analysis/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz .
    """
47
48
49
50
shell:
    """
    wget -i {input} -P data/ct_pro
    """
57
58
59
60
shell:
    """
    wget -i {input} -P data/mesc
    """
70
71
72
73
74
75
76
77
78
79
80
shell:
    """
    wget -i {input} -P data/henikoff
    wget https://github.com/Henikoff/scCUT-Tag/files/8647406/meta.csv -P data/henikoff
    wget https://github.com/Henikoff/scCUT-Tag/files/8653804/K27Ac_pbmc_meta.csv -P data/henikoff

    cd data/henikoff
    tabix -p bed GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz
    tabix -p bed GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz
    tabix -p bed GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz
    """
91
92
93
94
shell:
    """
    Rscript code/CT_pro/update_objects.R
    """
100
101
102
103
104
shell:
    """
    cd data/multict/fragments
    aws s3 sync s3://multi-cuttag/fragments/ . --request-payer
    """
SnakeMake From line 100 of master/Snakefile
109
110
111
112
113
114
115
shell:
    """
    cd data/K562_ATAC
    wget https://www.encodeproject.org/files/ENCFF006OFA/@@download/ENCFF006OFA.bigBed
    wget https://www.encodeproject.org/files/ENCFF600FDO/@@download/ENCFF600FDO.bigWig
    wget https://www.encodeproject.org/files/ENCFF558BLC/@@download/ENCFF558BLC.bed.gz
    """
SnakeMake From line 109 of master/Snakefile
122
123
124
125
126
shell:
    """
    wget -i {input} -P data/bmmc_atac
    aws s3 sync s3://mpal-hg38/public/ ./data/bmmc_atac/ --request-payer
    """
SnakeMake From line 122 of master/Snakefile
131
132
133
134
shell:
    """
    aws s3 sync s3://bmmc-reference/public objects/ --request-payer
    """
SnakeMake From line 131 of master/Snakefile
144
145
146
147
shell:
    """
    wget -i {input} -P data/encode
    """
SnakeMake From line 144 of master/Snakefile
157
158
159
160
shell:
    """
    Rscript code/encode/combine_peaks.R
    """
SnakeMake From line 157 of master/Snakefile
166
167
168
169
170
171
172
173
shell:
    """
    cd data/pbmc_atac
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz.tbi
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_singlecell.csv
    wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_filtered_peak_bc_matrix.h5
    """
SnakeMake From line 166 of master/Snakefile
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.r1} \
        {input.r2} \
        | samtools sort -@ {threads} -O bam - \
        > data/HEK_K562_sc/mapped/{wildcards.mark}.bam

    samtools index data/HEK_K562_sc/mapped/{wildcards.mark}.bam

    sinto fragments \
      -b data/HEK_K562_sc/mapped/{wildcards.mark}.bam \
      -p {threads} \
      -f data/HEK_K562_sc/sinto/{wildcards.mark}.tmp \
      --barcode_regex "[^:]*"

    sort -k1,1 -k2,2n data/HEK_K562_sc/sinto/{wildcards.mark}.tmp > data/HEK_K562_sc/sinto/{wildcards.mark}.bed
    bgzip -@ {threads} data/HEK_K562_sc/sinto/{wildcards.mark}.bed
    tabix -p bed {output.frags}
    rm data/HEK_K562_sc/sinto/{wildcards.mark}.tmp
    """
217
218
219
220
221
222
223
224
225
226
shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.r1} \
        {input.r2} \
        | samtools sort -@ {threads} -O bam - \
        > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam
    samtools index data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam
    """
232
233
234
235
shell:
    """
    bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM
    """
241
242
243
244
245
246
247
248
249
250
251
252
253
shell:
    """
    sinto fragments -p {threads} \
      -b {input} \
      --barcode_regex "[^:]*" \
      -f data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag

    sort -k1,1 -k2,2n data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag \
      > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed
    bgzip data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed
    tabix -p bed {output}
    rm data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag
    """
265
266
267
268
shell:
    """
    Rscript code/HEK_K562_sc/process.R {threads} {output.obj} data/HEK_K562_sc/sinto
    """
SnakeMake From line 265 of master/Snakefile
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
shell:
    """
    # me3
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.me3_r1_mono} \
        {input.me3_r2_mono} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/mono/me3.bam
    samtools index data/pbmc_bulk/mapped/mono/me3.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.me3_r1_plex} \
        {input.me3_r2_plex} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/plex/me3.bam
    samtools index data/pbmc_bulk/mapped/plex/me3.bam

    # ac
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.ac_r1_mono} \
        {input.ac_r2_mono} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/mono/ac.bam
    samtools index data/pbmc_bulk/mapped/mono/ac.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        {input.ac_r1_plex} \
        {input.ac_r2_plex} \
        | samtools sort -@ {threads} -O bam - \
        > data/pbmc_bulk/mapped/plex/ac.bam
    samtools index data/pbmc_bulk/mapped/plex/ac.bam
    """
329
330
331
332
shell:
    """
    bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM
    """
338
339
340
341
342
343
344
345
346
347
348
349
350
shell:
    """
    sinto fragments -p {threads} \
      -b {input} \
      --barcode_regex "[^:]*" \
      -f data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag

    sort -k1,1 -k2,2n data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag \
      > data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed
    bgzip data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed
    tabix -p bed {output}
    rm data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag
    """
363
364
365
366
367
368
369
370
371
372
shell:
    """
    bedtools genomecov -i {input.me3} -g {input.chrom} -bg > data/ct_pro/H3K27me3.tmp
    sort -k1,1 -k2,2n data/ct_pro/H3K27me3.tmp > {output.me3_bg}
    bedGraphToBigWig {output.me3_bg} {input.chrom} {output.me3_bw}

    bedtools genomecov -i {input.ac} -g {input.chrom} -bg > data/ct_pro/H3K27ac.tmp
    sort -k1,1 -k2,2n data/ct_pro/H3K27ac.tmp > {output.ac_bg}
    bedGraphToBigWig {output.ac_bg} {input.chrom} {output.ac_bw}
    """
382
383
384
385
shell:
    """
    wget -i {input} -P data/k562_peaks
    """
SnakeMake From line 382 of master/Snakefile
393
394
395
396
shell:
    """
    Rscript code/HEK_K562_bulk/collect_peaks.R
    """
SnakeMake From line 393 of master/Snakefile
411
412
413
414
415
416
417
418
419
shell:
    """
    python code/demux.py \
       --read1 {input.read1} \
       --read_i5 {input.readi5} \
       --read2 {input.read2} \
       --tn5 {input.bc} \
       --output {params.outdir}
    """
SnakeMake From line 411 of master/Snakefile
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/{wildcards.exp}/ntt/out/{wildcards.mk}.R1.fastq \
        data/{wildcards.exp}/ntt/out/{wildcards.mk}.R2.fastq \
        | samtools sort -@ {threads} -O bam - \
        > data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam

    samtools index data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam

    sinto fragments -p {threads} \
      -b data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam \
      --barcode_regex "[^:]*" \
      -f data/{wildcards.exp}/ntt/{wildcards.mk}.frag

    sort -k1,1 -k2,2n data/{wildcards.exp}/ntt/{wildcards.mk}.frag > data/{wildcards.exp}/ntt/{wildcards.mk}.tsv

    rm data/{wildcards.exp}/ntt/{wildcards.mk}.frag
    bgzip -@ {threads} data/{wildcards.exp}/ntt/{wildcards.mk}.tsv
    tabix -p bed {output}
    """
456
457
458
459
shell:
    """
    salmon index -t {input} -i {output} --features -k7
    """
468
469
470
471
472
473
474
475
shell:
    """
    paste <(gunzip -c {input.r1}) <(gunzip -c {input.r3}) \
      | paste - - - - \
      | awk -F'\\t' -v one="data/pbmc_protein/adt/read1.fastq" '{{ OFS="\\n"; print $1,$3$4,$5,$7$8 >> one;}}'

    pigz -p {threads} data/pbmc_protein/adt/read1.fastq
    """
SnakeMake From line 468 of master/Snakefile
484
485
486
487
488
489
490
491
492
493
494
495
496
497
shell:
    """
    salmon alevin -l ISR -i {input.index} \
        -o data/pbmc_protein/adt/outs \
        -p {threads} \
        --naiveEqclass \
        --keepCBFraction 0.8 \
        -1 data/pbmc_protein/adt/TAG_S2_R2_001.fastq.gz \
        -2 {input.reads} \
        --bc-geometry 1[1-16] \
        --umi-geometry 2[1-10] \
        --read-geometry 2[71-85] \
        --tgMap data/adt_map_totalseq_a.tsv
    """
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
shell:
    """
    cd data/bmmc_dual/DNA
    cutadapt -j {threads} -u 14 -o K27ac_R2.fastq scBM_k27ac_S2_R2_001.fastq.gz
    cutadapt -j {threads} -u 14 -o K27me_R2.fastq scBM_k27me_S1_R2_001.fastq.gz

    gzip -d scBM_k27ac_S2_R1_001.fastq.gz scBM_k27ac_S2_R3_001.fastq.gz
    gzip -d scBM_k27me_S1_R1_001.fastq.gz scBM_k27me_S1_R3_001.fastq.gz        

    sinto barcode \
      --barcode_fastq K27ac_R2.fastq \
      --read1 scBM_k27ac_S2_R1_001.fastq \
      --read2 scBM_k27ac_S2_R3_001.fastq \
      -b 16

    sinto barcode \
      --barcode_fastq K27me_R2.fastq \
      --read1 scBM_k27me_S1_R1_001.fastq \
      --read2 scBM_k27me_S1_R3_001.fastq \
      -b 16

    pigz -p {threads} *.fastq
    mkdir barcoded
    mv scBM_k27me_S1_R1_001.barcoded.fastq.gz scBM_k27me_S1_R3_001.barcoded.fastq.gz ./barcoded/
    mv scBM_k27ac_S2_R1_001.barcoded.fastq.gz scBM_k27ac_S2_R3_001.barcoded.fastq.gz ./barcoded/
    """
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
shell:
    """
    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R1_001.barcoded.fastq.gz \
        data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R3_001.barcoded.fastq.gz \
        | samtools sort -@ {threads} -O bam - \
        > data/bmmc_dual/DNA/H3K27me3.bam

    bwa-mem2 mem {input.genome} \
        -t {threads} \
        data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R1_001.barcoded.fastq.gz \
        data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R3_001.barcoded.fastq.gz \
        | samtools sort -@ {threads} -O bam - \
        > data/bmmc_dual/DNA/H3K27ac.bam

    samtools index data/bmmc_dual/DNA/H3K27ac.bam
    samtools index data/bmmc_dual/DNA/H3K27me3.bam

    sinto fragments -p {threads} \
      -b data/bmmc_dual/DNA/H3K27ac.bam \
      --barcode_regex "[^:]*" \
      -f data/bmmc_dual/DNA/H3K27ac.frag

    sinto fragments -p {threads} \
      -b data/bmmc_dual/DNA/H3K27me3.bam \
      --barcode_regex "[^:]*" \
      -f data/bmmc_dual/DNA/H3K27me3.frag

    sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27ac.frag > data/bmmc_dual/DNA/H3K27ac.bed
    sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27me3.frag > data/bmmc_dual/DNA/H3K27me3.bed

    rm data/bmmc_dual/DNA/H3K27ac.frag data/bmmc_dual/DNA/H3K27me3.frag
    bgzip -@ {threads} data/bmmc_dual/DNA/H3K27ac.bed
    bgzip -@ {threads} data/bmmc_dual/DNA/H3K27me3.bed
    tabix -p bed {output.ac}
    tabix -p bed {output.me}
    """
584
585
586
587
shell:
    """
    Rscript code/bmmc_atac/process_bmmc_atac.r
    """
SnakeMake From line 584 of master/Snakefile
598
599
600
601
shell:
    """
    Rscript code/henikoff/process.R
    """
SnakeMake From line 598 of master/Snakefile
616
617
618
619
shell:
    """
    Rscript code/HEK_K562_bulk/analysis.R
    """
SnakeMake From line 616 of master/Snakefile
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
shell:
    """
    gzip -dc {input.me3_peaks} > data/k562_peaks/ENCFF031FSF.bed
    gzip -dc {input.ac_peaks} > data/k562_peaks/ENCFF038DDS.bed

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R data/k562_peaks/ENCFF049HUP.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/h3k27me3.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R data/k562_peaks/ENCFF038DDS.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/h3k27ac.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \
      {input.mono_ac} {input.mono_me3} {input.mono_rna} \
      -R {input.rna_peaks} \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/HEK_K562_bulk/rna.mat.gz

    plotHeatmap \
      -m data/HEK_K562_bulk/h3k27me3.mat.gz  \
      -o {output.me3} \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/HEK_K562_bulk/h3k27ac.mat.gz  \
      -o {output.ac} \
      --zMax 0.5 \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/HEK_K562_bulk/rna.mat.gz  \
      -o {output.rna} \
      --zMax 0.5 \
      --whatToShow 'heatmap and colorbar' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767'
    """
697
698
699
700
shell:
    """
    Rscript code/HEK_K562_sc/analysis.R
    """
SnakeMake From line 697 of master/Snakefile
706
707
708
709
shell:
    """
    Rscript code/HEK/K562_bulk/quantify_regions.R
    """
SnakeMake From line 706 of master/Snakefile
719
720
721
722
shell:
    """
    Rscript code/HEK_K562_sc/mct_scatter.R
    """
SnakeMake From line 719 of master/Snakefile
730
731
732
733
shell:
    """
    Rscript code/pbmc_atac/process.R
    """
SnakeMake From line 730 of master/Snakefile
742
743
744
745
shell:
    """
    Rscript code/pbmc_protein/process.R {threads} {input.adt} {input.ac} {input.me} {output}
    """
SnakeMake From line 742 of master/Snakefile
753
754
755
756
shell:
    """
    Rscript code/pbmc_sc/process.R {threads} {input.ac} {input.me} {output}
    """
SnakeMake From line 753 of master/Snakefile
770
771
772
773
774
775
776
777
778
779
780
781
782
shell:
    """
    # split fragment files
    Rscript code/pbmc_protein/split.R

    # create bigwig
    cd data/pbmc_protein/bigwig
    for fragfile in $(ls -d *.bed.gz);do
      fname=(${{fragfile//.bed.gz/ }})
      bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg"
      bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw"
    done
    """
795
796
797
798
799
800
801
802
803
804
805
806
807
shell:
    """
    # split fragment files
    Rscript code/pbmc_sc/split.R

    # create bigwig
    cd data/pbmc_sc/ntt
    for fragfile in $(ls -d *.bed.gz);do
      fname=(${{fragfile//.bed.gz/ }})
      bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg"
      bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw"
    done
    """
814
815
816
817
shell:
    """
    Rscript code/bmmc_dual/split.R
    """
SnakeMake From line 814 of master/Snakefile
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
shell:
    """
    multiBigwigSummary BED-file -b \
      {input.me3_sc} \
      {input.ac_sc} \
      {input.me3_bulk_mono} \
      {input.me3_bulk_plex} \
      {input.ac_bulk_mono} \
      {input.ac_bulk_plex} \
      {input.me3_chip} \
      {input.ac_chip} \
      {input.ctp_ac_bw} \
      {input.ctp_me3_bw} \
      --BED {input.all_peaks} \
      -p {threads} \
      --outRawCounts {output} \
      -o data/pbmc_bulk/cor_matrix_all.npz
    """
SnakeMake From line 834 of master/Snakefile
862
863
864
865
866
867
868
869
870
871
872
873
shell:
    """
    multiBigwigSummary BED-file -b \
      {input.me3_1} \
      {input.ac_1} \
      {input.me3_2} \
      {input.ac_2} \
      --BED {input.all_peaks} \
      -p {threads} \
      --outRawCounts {output} \
      -o data/pbmc_protein/replicate_cor_matrix_all.npz
    """
SnakeMake From line 862 of master/Snakefile
879
880
881
882
shell:
    """
    Rscript code/pbmc_bulk/encode_cor_all.R
    """
SnakeMake From line 879 of master/Snakefile
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
shell:
    """
    # ac cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_ac.bw \
      data/pbmc_protein/bigwig/NK_ac.bw \
      data/pbmc_protein/bigwig/CD14_Mono_ac.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \
      data/pbmc_protein/bigwig/Late_erythroid_ac.bw \
      data/encode/ENCFF293ETP.bigWig \
      data/encode/ENCFF611GRL.bigWig \
      data/encode/ENCFF181OXO.bigWig \
      data/encode/ENCFF526VJO.bigWig \
      data/encode/ENCFF601NLG.bigWig \
      data/encode/ENCFF951ZBV.bigWig \
      data/encode/ENCFF064JOI.bigWig \
      --BED {input.ac} \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw_ac.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_ac.npz

    # me3 cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_me3.bw \
      data/pbmc_protein/bigwig/NK_me3.bw \
      data/pbmc_protein/bigwig/CD14_Mono_me3.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \
      data/pbmc_protein/bigwig/Late_erythroid_me3.bw \
      data/encode/ENCFF569IPX.bigWig \
      data/encode/ENCFF842JLZ.bigWig \
      data/encode/ENCFF811VAX.bigWig \
      data/encode/ENCFF499VWN.bigWig \
      data/encode/ENCFF777EGG.bigWig \
      data/encode/ENCFF046VLL.bigWig \
      data/encode/ENCFF085YMB.bigWig \
      --BED {input.me3} \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw_me3.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_me3.npz

    # all cor
    multiBigwigSummary BED-file -b \
      data/pbmc_protein/bigwig/B_cell_me3.bw \
      data/pbmc_protein/bigwig/NK_me3.bw \
      data/pbmc_protein/bigwig/CD14_Mono_me3.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \
      data/pbmc_protein/bigwig/Late_erythroid_me3.bw \
      data/pbmc_protein/bigwig/B_cell_ac.bw \
      data/pbmc_protein/bigwig/NK_ac.bw \
      data/pbmc_protein/bigwig/CD14_Mono_ac.bw \
      data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \
      data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \
      data/pbmc_protein/bigwig/Late_erythroid_ac.bw \
      data/encode/ENCFF569IPX.bigWig \
      data/encode/ENCFF842JLZ.bigWig \
      data/encode/ENCFF811VAX.bigWig \
      data/encode/ENCFF499VWN.bigWig \
      data/encode/ENCFF777EGG.bigWig \
      data/encode/ENCFF046VLL.bigWig \
      data/encode/ENCFF085YMB.bigWig \
      data/encode/ENCFF293ETP.bigWig \
      data/encode/ENCFF611GRL.bigWig \
      data/encode/ENCFF181OXO.bigWig \
      data/encode/ENCFF526VJO.bigWig \
      data/encode/ENCFF601NLG.bigWig \
      data/encode/ENCFF951ZBV.bigWig \
      data/encode/ENCFF064JOI.bigWig \
      --BED data/encode/all.bed \
      -p {threads} \
      --outRawCounts data/pbmc_protein/bigwig/raw.tsv \
      -o data/pbmc_protein/bigwig/cor_matrix_all.npz
    """
SnakeMake From line 896 of master/Snakefile
982
983
984
985
shell:
    """
    Rscript code/pbmc_protein/encode_cor.R
    """
SnakeMake From line 982 of master/Snakefile
994
995
996
997
shell:
    """
    Rscript code/pbmc_bulk/analysis.R
    """
SnakeMake From line 994 of master/Snakefile
1008
1009
1010
1011
shell:
    """
    Rscript code/pbmc_bulk/frip.R
    """
SnakeMake From line 1008 of master/Snakefile
1024
1025
1026
1027
shell:
    """
    Rscript code/pbmc_protein/frip.R
    """
SnakeMake From line 1024 of master/Snakefile
1035
1036
1037
1038
shell:
    """
    Rscript code/bmmc_dual/frip_bmmc.R
    """
SnakeMake From line 1035 of master/Snakefile
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
shell:
    """
    gzip -dc {input.me3_peaks} > data/encode/ENCFF291LVP.bed
    gzip -dc {input.ac_peaks} > data/encode/ENCFF832RWT.bed

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \
      -R data/encode/ENCFF291LVP.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/pbmc_bulk/h3k27me3.mat.gz

    computeMatrix reference-point \
      --referencePoint 'center' \
      --missingDataAsZero \
      -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \
      -R data/encode/ENCFF832RWT.bed \
      --upstream 5000 --downstream 5000 \
      -p {threads} \
      -o data/pbmc_bulk/h3k27ac.mat.gz

    plotHeatmap \
      -m data/pbmc_bulk/h3k27me3.mat.gz \
      -o {output.me3} \
      --whatToShow 'heatmap only' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767'

    plotHeatmap \
      -m data/pbmc_bulk/h3k27ac.mat.gz \
      -o {output.ac} \
      --whatToShow 'heatmap only' \
      --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767'
    """
1100
1101
1102
1103
shell:
    """
    Rscript code/pbmc_protein/scatterplots.R {input.ac_peaks} {input.me3_peaks}
    """
SnakeMake From line 1100 of master/Snakefile
1115
1116
1117
1118
shell:
    """
    Rscript code/pbmc_sc/scatterplots_sc.R {input.ac_peaks} {input.me3_peaks}
    """
SnakeMake From line 1115 of master/Snakefile
1126
1127
1128
1129
shell:
    """
    Rscript code/bmmc_dual/process.R {threads} {output}
    """
SnakeMake From line 1126 of master/Snakefile
1139
1140
1141
1142
shell:
    """
    Rscript code/bmmc_dual/trajectory.R
    """
SnakeMake From line 1139 of master/Snakefile
1147
1148
1149
1150
shell:
    """
    Rscript code/bmmc_dual/transfer.R
    """
SnakeMake From line 1147 of master/Snakefile
1155
1156
1157
1158
shell:
    """
    Rscript code/bmmc_dual/analysis.R
    """
SnakeMake From line 1155 of master/Snakefile
1169
1170
1171
1172
shell:
    """
    Rscript code/pbmc_protein/analysis.R {input.ntt} {input.ct_ac} {input.ct_me3} {input.ntt_2}
    """
SnakeMake From line 1169 of master/Snakefile
1184
1185
1186
1187
shell:
    """
    Rscript code/pbmc_bulk/quantify_regions.R
    """
SnakeMake From line 1184 of master/Snakefile
ShowHide 73 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://doi.org/10.1038/s41587-022-01588-5
Name: nanobody
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...