Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
This repository contains code needed to reproduce the analyses shown in Stuart et al. (2022), Nature Biotechnology
More information about NTT-seq can be found at the NTT-seq website
Installing dependencies
All the required dependencies needed to run the workflow can be installed automatically by creating a new conda environment.
First ensure that conda or mamba is installed and available.
To create a new environment with the dependencies installed, run:
# using conda
conda env create -f environment.yaml
# using mamba
mamba env create -f environment.yaml
Running the workflow
This workflow involves downloading data from AWS using the AWS
command line tools. To enable the download, you will first need
to create an AWS account and set up the AWS command line tools by
running
aws configure
.
Note that some of the data downloaded
may incur charges from AWS
.
To run the Snakemake workflow, first activate the conda environment containing the required dependencies:
conda activate ntt
Next, run
snakemake
with the desired options. Setting the
-j
parameter
controls the maximum number of cores used by the workflow:
snakemake -j 24
See the snakemake documentation for a complete list of available options.
Data availability
Processed datasets from this study are available from:
-
GEO: GSE212588
-
SRA: SRP395379
-
Zenodo: 7102159
-
dbGaP: phs003068.v1.p1
Plasmid availability
Plasmids generated from this study are available from AddGene :
Citation
@ARTICLE{Stuart2022,
title = "Nanobody-tethered transposition enables multifactorial chromatin
profiling at single-cell resolution",
author = "Stuart, Tim and Hao, Stephanie and Zhang, Bingjie and
Mekerishvili, Levan and Landau, Dan A and Maniatis, Silas and
Satija, Rahul and Raimondi, Ivan",
journal = "Nat. Biotechnol.",
month = dec,
year = 2022,
url = "http://dx.doi.org/10.1038/s41587-022-01588-5",
language = "en"
}
Code Snippets
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | library(Signac) library(Seurat) library(SummarizedExperiment) library(future) library(GenomicRanges) library(EnsDb.Hsapiens.v86) plan("multicore", workers = 8) options(future.globals.maxSize = 100 * 1024 ^ 3) ### Load processed object for the metadata orig.obj <- readRDS("data/bmmc_atac/scATAC-Healthy-Hematopoiesis-191120.rds") annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86) seqlevelsStyle(annot) <- "UCSC" # get all fragment files frag.files <- list.files("data/bmmc_atac/", pattern = "^scATAC_", full.names = TRUE) frag.files <- paste0(frag.files, "/fragments.tsv.gz") names(frag.files) <- list.files("data/bmmc_atac/", pattern = "^scATAC_") # call peaks on each and combine pk.all <- list() for (i in seq_along(frag.files)) { pks <- CallPeaks( object = frag.files[[i]] ) pk.all[[i]] <- pks } gr.combined <- Reduce(f = c, x = pk.all) gr.combined <- GenomicRanges::reduce(gr.combined) gr.combined <- keepStandardChromosomes(gr.combined, pruning.mode = "coarse") # quantify and create objects md <- colData(orig.obj) obj.list <- list() for (i in seq_along(frag.files)) { obj_ident <- gsub(pattern = "scATAC_", x = names(frag.files)[[i]], replacement = "") cells.keep <- md[md$Group == obj_ident, "Barcode"] frags <- CreateFragmentObject( path = frag.files[[i]], cells = cells.keep ) peakcounts <- FeatureMatrix( fragments = frags, features = gr.combined, cells = cells.keep ) obj <- CreateChromatinAssay(counts = peakcounts, fragments = frags) obj <- CreateSeuratObject(counts = obj, assay = "ATAC") obj <- RenameCells(obj, add.cell.id = obj_ident) obj.list[[i]] <- obj } rownames(md) <- paste0(md$Group, "_", md$Barcode) md <- as.data.frame(md) bmmc <- merge(x = obj.list[[1]], y = obj.list[2:length(obj.list)]) bmmc <- AddMetaData(object = bmmc, metadata = md) Annotation(bmmc) <- annot bmmc$tissue <- sapply(strsplit(bmmc$Group, "_"), `[[`, 1) bmmc$sample <- sapply(strsplit(bmmc$Group, "_"), `[[`, 2) # remove PBMC Idents(bmmc) <- "tissue" bmmc <- subset(bmmc, idents = "PBMC", invert = TRUE) bmmc <- NucleosomeSignal(bmmc) # process bmmc <- FindTopFeatures(bmmc) bmmc <- RunTFIDF(bmmc) bmmc <- RunSVD(bmmc) bmmc <- RunUMAP(bmmc, reduction = "lsi", dims = 2:40) ga <- GeneActivity(bmmc) bmmc[["GA"]] <- CreateAssayObject(counts = ga) DefaultAssay(bmmc) <- "GA" bmmc <- NormalizeData(bmmc) DefaultAssay(bmmc) <- "ATAC" # integrate embeddings atac <- SplitObject(bmmc, split.by = "Group") atac <- lapply(atac, RunSVD, scale.embeddings = FALSE) atac_anchors <- FindIntegrationAnchors( object.list = atac, reduction = "rlsi", dims = 2:30 ) atac_integrated <- IntegrateEmbeddings( anchorset = atac_anchors, new.reduction.name = "integrated_lsi", reductions = bmmc[["lsi"]], dims.to.integrate = 1:30 ) bmmc[["integrated_lsi"]] <- atac_integrated[["integrated_lsi"]] saveRDS(object = bmmc, file = "objects/bmmc_atac.rds") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 | library(Seurat) library(Signac) library(ggplot2) library(patchwork) bmmc <- readRDS("objects/bmmc_dual.rds") colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A") p1 <- DimPlot(bmmc, reduction = "umap.me3", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) + theme_void() + theme(text = element_text(size = 10), legend.position = "none") + ggtitle("H3K27me3") p2 <- DimPlot(bmmc, reduction = "umap.ac", group.by = "celltype", pt.size = 0.1, label = TRUE, label.size = 3, repel = TRUE) + theme_void() + theme(text = element_text(size = 10), legend.position = "none") + ggtitle("H3K27ac") + NoLegend() p3 <- DimPlot(bmmc, reduction = "umap.wnn", group.by = "celltype", label = TRUE, label.size = 4, repel = TRUE) + ggtitle("BMMCs: H3K27me3 + H3K27ac") + theme_classic() + NoLegend() + ylab('UMAP 2') + xlab("UMAP 1") pp <- ((p1 / p2) | p3) + plot_layout(widths = c(1, 2)) ggsave(filename = "plots/bmmc/dimplots_bmmc.png", plot = pp, height = 6, width = 8) # fragment counts fm <- CountFragments( fragments = Fragments(bmmc[["me3"]])[[1]]@path, cells = colnames(bmmc) ) fa <- CountFragments( fragments = Fragments(bmmc[["ac"]])[[1]]@path, cells = colnames(bmmc) ) mean(fm$frequency_count) # 1217 mean(fa$frequency_count) # 326 sd(fm$frequency_count) # 1274 sd(fa$frequency_count) # 334 # create plot fm$mark <- "H3K27me3" fa$mark <- "H3K27ac" all_counts <- rbind(fm, fa) p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = mark)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + ylab("Fragments") + xlab("Mark") + ggtitle("Total fragments") + theme(legend.position = "none") + scale_fill_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) ggsave(filename = "plots/bmmc/fragments_bmmc.png", plot = p0, height = 3, width = 3) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 | library(Signac) library(Seurat) library(GenomicRanges) library(ggplot2) bmmc <- readRDS("objects/bmmc_dual.rds") # ENCODE peaks pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz") pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz") pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3") pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3") t_bmmc_ac <- CountFragments(fragments = Fragments(bmmc[['ac']])[[1]]@path, cells = colnames(bmmc)) totals_bmmc_ac <- t_bmmc_ac$frequency_count names(totals_bmmc_ac) <- t_bmmc_ac$CB t_bmmc_me3 <- CountFragments(fragments = Fragments(bmmc[['me3']])[[1]]@path, cells = colnames(bmmc)) totals_bmmc_me3 <- t_bmmc_me3$frequency_count names(totals_bmmc_me3) <- t_bmmc_me3$CB # total counts in peak regions get_total <- function(obj, regions) { counts <- FeatureMatrix( fragments = Fragments(obj), features = regions, cells = colnames(obj) ) return(colSums(counts)) } # ntt in_peak_bmmc_ac_ac <- get_total(obj = bmmc[['ac']], regions = pk_ac) in_peak_bmmc_ac_me <- get_total(obj = bmmc[['ac']], regions = pk_me3) in_peak_bmmc_me_me <- get_total(obj = bmmc[['me3']], regions = pk_me3) in_peak_bmmc_me_ac <- get_total(obj = bmmc[['me3']], regions = pk_ac) # frip make_df <- function(totals, inpeak, assay, pk, dataset) { d <- data.frame( cell = names(totals), total = totals, in_peak = inpeak[names(totals)], assay = assay, peak = pk, dataset = dataset ) d$FRIP <- d$in_peak / d$total return(d) } df <- rbind( make_df(totals_bmmc_ac, in_peak_bmmc_ac_ac, "H3K27ac", "H3K27ac", "NTT"), make_df(totals_bmmc_ac, in_peak_bmmc_ac_me, "H3K27ac", "H3K27me3", "NTT"), make_df(totals_bmmc_me3, in_peak_bmmc_me_me, "H3K27me3", 'H3K27me3', "NTT"), make_df(totals_bmmc_me3, in_peak_bmmc_me_ac, "H3K27me3", "H3K27ac", "NTT") ) df_filt <- df[df$assay == df$peak, ] p <- ggplot(df_filt, aes(x = peak, y = FRIP, fill = peak)) + geom_boxplot(outlier.size = 0.2) + scale_fill_manual(values = c("#F98401", "#D3145A")) + theme_bw() + ylab("FRiP") + xlab("Assay") + ggtitle("FRiP") + theme(legend.position = "none") ggsave("plots/bmmc/frip_bmmc.png", plot = p, height = 4, width = 3) saveRDS(object = df_filt, file = "data/bmmc_dual/frip.rds") mean(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.18 sd(df_filt[df_filt$peak == "H3K27me3", "FRIP" ]) # 0.09 mean(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.26 sd(df_filt[df_filt$peak == "H3K27ac", "FRIP" ]) # 0.09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | library(Seurat) library(Signac) library(EnsDb.Hsapiens.v86) library(BSgenome.Hsapiens.UCSC.hg38) library(future) library(GenomicRanges) args <- commandArgs(trailingOnly = TRUE) threads <- args[[1]] outfile <- args[[2]] plan("multicore", workers = as.integer(threads)) options(future.globals.maxSize = Inf) # annotation annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86) seqlevelsStyle(annot) <- "UCSC" genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23] fpath <- "data/bmmc_dual/DNA" fragfiles <- list.files( path = fpath, pattern = "*.bed.gz$", full.names = TRUE ) fname <- gsub(pattern = paste0(fpath, "/"), replacement = "", x = fragfiles) fname <- gsub(pattern = ".bed.gz", replacement = "", x = fname) names(fragfiles) <- fname total_counts <- CountFragments(fragments = as.list(fragfiles)) cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"] frags_me3 <- CreateFragmentObject( path = fragfiles[['H3K27me3']], cells = cells.keep ) frags_ac <- CreateFragmentObject( path = fragfiles[['H3K27ac']], cells = cells.keep ) counts_me3 <- AggregateTiles( object = frags_me3, cells = cells.keep, min_counts = 1, binsize = 5000, genome = genome.use ) counts_ac <- AggregateTiles( object = frags_ac, cells = cells.keep, min_counts = 3, binsize = 1000, genome = genome.use ) # remove features overlapping blacklist remove_blacklist <- function(counts) { olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified) is_bl <- queryHits(olap) counts <- counts[setdiff(1:nrow(counts), is_bl), ] return(counts) } # create seurat object me3 <- CreateChromatinAssay( counts = remove_blacklist(counts_me3), fragments = fragfiles[['H3K27me3']], annotation = annot ) ac <- CreateChromatinAssay( counts = remove_blacklist(counts_ac), fragments = fragfiles[['H3K27ac']], annotation = annot ) bmmc <- CreateSeuratObject(counts = me3, assay = "me3") bmmc[['ac']] <- ac bmmc <- subset(bmmc, subset = nCount_me3 < 10000 & nCount_ac < 10000 & nCount_me3 > 100 & nCount_ac > 75) bmmc <- TSSEnrichment(bmmc, assay = "me3") bmmc$tss.me3 <- bmmc$TSS.enrichment bmmc <- TSSEnrichment(bmmc, assay = "ac") bmmc$tss.ac <- bmmc$TSS.enrichment bmmc <- NucleosomeSignal(bmmc, assay = "me3") bmmc$nucleosome.me3 <- bmmc$nucleosome_signal bmmc <- NucleosomeSignal(bmmc, assay = "ac") bmmc$nucleosome.ac <- bmmc$nucleosome_signal bmmc <- subset(bmmc, subset = tss.ac > 1) DefaultAssay(bmmc) <- "me3" feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1)) # remove low-count features bmmc[['me3']] <- subset(bmmc[['me3']], features = feat.keep) bmmc <- FindTopFeatures(bmmc) bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_me3)) bmmc <- RunSVD(bmmc, reduction.name = "lsi.me3", n = 100, scale.embeddings = FALSE) bmmc <- RunUMAP(bmmc, reduction = "lsi.me3", reduction.name = "umap.me3", dims = 2:100) DefaultAssay(bmmc) <- "ac" feat.keep <- names(which(rowSums(bmmc, slot = 'counts') > 1)) bmmc[['ac']] <- subset(bmmc[['ac']], features = feat.keep) bmmc <- FindTopFeatures(bmmc) bmmc <- RunTFIDF(bmmc, scale.factor = median(bmmc$nCount_ac)) bmmc <- RunSVD(bmmc, reduction.name = "lsi.ac", n = 100, scale.embeddings = FALSE) bmmc <- RunUMAP(bmmc, reduction = "lsi.ac", reduction.name = "umap.ac", dims = 2:60) bmmc <- FindMultiModalNeighbors( object = bmmc, reduction.list = list("lsi.me3", "lsi.ac"), dims.list = list(2:100, 2:60) ) bmmc <- RunUMAP(bmmc, nn.name = "weighted.nn", reduction.name = "umap.wnn") bmmc <- FindClusters(bmmc, graph.name = "wsnn", algorithm = 3, resolution = 4) renaming <- list( '0' = "NK", '1' = "NK", '2' = "GMP/CLP", '3' = 'CD8 Naive', '4' = 'CD4 Naive', '5' = 'CD14 Mono', '6' = 'pDC', '7' = 'B', '8' = 'CD4 Memory', '9' = 'CD8 Memory', '10' = 'CD14 Mono', '11' = 'Pre-B', '12' = 'CD8 Memory', '13' = 'Late erythroid', '14' = 'Early erythroid', '15' = 'CD14 Mono', '16' = 'B', '17' = 'HSPC', '18' = 'CD14 Mono', '19' = 'CD14 Mono', '20' = 'Plasma', '21' = 'Early basophil' ) Idents(bmmc) <- "seurat_clusters" bmmc <- RenameIdents(bmmc, renaming) bmmc$celltype <- Idents(bmmc) saveRDS(object = bmmc, file = "objects/bmmc_dual.rds") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 | library(Signac) library(Seurat) obj <- readRDS("objects/bmmc_dual.rds") outdir <- "data/bmmc_dual/bulk_fragments/" # remove non-cell fragments for each assay for genome bin quant DefaultAssay(obj) <- "ac" frags <- Fragments(obj)[[1]]@path FilterCells( fragments = frags, cells = colnames(obj), outfile = paste0(outdir, "ac.bed.gz"), verbose = TRUE ) DefaultAssay(obj) <- "me3" frags <- Fragments(obj)[[1]]@path FilterCells( fragments = frags, cells = colnames(obj), outfile = paste0(outdir, "me3.bed.gz"), verbose = TRUE ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 | library(Signac) library(Seurat) library(ggplot2) library(Matrix) library(BSgenome.Hsapiens.UCSC.hg38) library(GenomeInfoDb) library(RColorBrewer) library(RcppRoll) if (!requireNamespace("SeuratWrappers", quietly = TRUE)) { remotes::install_github('satijalab/seurat-wrappers') } library(SeuratWrappers) if (!requireNamespace("monocle3", quietly = TRUE)) { setRepositories(ind=1:2) remotes::install_github('cole-trapnell-lab/leidenbase') remotes::install_github('cole-trapnell-lab/monocle3') } library(monocle3) library(future) plan("multicore", workers = 12) options(future.globals.maxSize = Inf) bmmc <- readRDS("objects/bmmc_dual.rds") Idents(bmmc) <- "celltype" cells.use <- c("HSPC", "GMP/CLP", "Pre-B", "B", "Plasma") bcell <- bmmc[, bmmc$celltype %in% cells.use] DefaultAssay(bcell) <- "me3" feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features bcell[['me3']] <- subset(bcell[['me3']], features = feat.keep) bcell <- FindTopFeatures(bcell, min.cutoff = 2) bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_me3)) bcell <- RunSVD(bcell, reduction.name = "lsi.me3", scale.embeddings = FALSE) bcell <- RunUMAP(bcell, reduction = "lsi.me3", reduction.name = 'umap.me3', dims = 2:20) DefaultAssay(bcell) <- "ac" feat.keep <- names(which(rowSums(bcell, slot = 'counts') > 1)) # remove low-count features bcell[['ac']] <- subset(bcell[['ac']], features = feat.keep) bcell <- FindTopFeatures(bcell, min.cutoff = 2) bcell <- RunTFIDF(bcell, scale.factor = median(bcell$nCount_ac)) bcell <- RunSVD(bcell, reduction.name = "lsi.ac", scale.embeddings = FALSE) bcell <- RunUMAP(bcell, reduction = "lsi.ac", reduction.name = 'umap.ac', dims = 2:20) # WNN bcell <- FindMultiModalNeighbors( object = bcell, reduction.list = list('lsi.me3', 'lsi.ac'), dims.list = list(2:20, 2:10) ) bcell <- RunUMAP(bcell, nn.name = "weighted.nn", reduction.name = "umap.wnn") ## run monocle3 bcell@reductions$umap <- bcell@reductions$umap.wnn # needs to be named UMAP for monocle bcell.cds <- as.cell_data_set(bcell) bcell.cds <- cluster_cells(cds = bcell.cds, reduction_method = "UMAP") bcell.cds <- learn_graph(bcell.cds, use_partition = FALSE) root_cells <- WhichCells(bcell, idents = "HSPC") bcell.cds <- order_cells(bcell.cds, reduction_method = "UMAP", root_cells = tail(root_cells, 10)) bcell <- AddMetaData( object = bcell, metadata = bcell.cds@principal_graph_aux@listData$UMAP$pseudotime, col.name = "B.cell.pseudotime" ) fp1 <- FeaturePlot(bcell, "B.cell.pseudotime", reduction = "umap.wnn") + scale_color_viridis_c(option = "C") + theme_void() + ggtitle("") + theme(legend.position = "none") ggsave("plots/bmmc/trajectory.png", plot = fp1, height = 4, width = 4) ## find features correlated with pseudotime chr.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22] bins_me3 <- GenomeBinMatrix( fragments = Fragments(bcell[['me3']]), genome = chr.use, binsize = 10000, cells = colnames(bcell) ) saveRDS(bins_me3, "bins_me3_bcell.rds") bins_me3 <- readRDS("bins_me3_bcell.rds") bins_ac <- GenomeBinMatrix( fragments = Fragments(bcell[['ac']]), genome = chr.use, binsize = 10000, cells = colnames(bcell) ) saveRDS(bins_ac, "bins_ac_bcell.rds") bins_ac <- readRDS("bins_ac_bcell.rds") DefaultAssay(bcell) <- "me3" bcell <- FindNeighbors(bcell, reduction = "lsi.me3", dims = 2:20, k = 50) knn <- bcell[['me3_nn']] knn_smooth <- function(counts, knn) { knn_norm <- knn / rowSums(knn) smoothed <- Matrix::tcrossprod(counts, knn_norm) return(smoothed) } smoothed_me3 <- knn_smooth(counts = bins_me3, knn = knn) smoothed_ac <- knn_smooth(counts = bins_ac, knn = knn) # filter regions based on total counts keep_region <- (rowSums(smoothed_ac) > 5) & (rowSums(smoothed_me3) > 5) smoothed_me3 <- smoothed_me3[keep_region, ] smoothed_ac <- smoothed_ac[keep_region, ] # depth normalize smoothed_me3 <- t(t(smoothed_me3) / colSums(smoothed_me3)) smoothed_ac <- t(t(smoothed_ac) / colSums(smoothed_ac)) ## correlation pstime <- bcell$B.cell.pseudotime[colnames(bins_me3)] peak_cor_me3 <- cor(t(as.matrix(smoothed_me3)), pstime)[, 1] peak_cor_ac <- cor(t(as.matrix(smoothed_ac)), pstime)[, 1] pv_me3 <- apply(smoothed_me3, 1, function(x) { cor.test(x, pstime)$p.value }) pv_ac <- apply(smoothed_ac, 1, function(x) { cor.test(x, pstime)$p.value }) cor_df <- data.frame(me3 = peak_cor_me3, ac = peak_cor_ac, p_me3 = pv_me3, p_adj_me3 = p.adjust(pv_me3), p_ac = pv_ac, p_adj_ac = p.adjust(pv_ac)) cor_df$diff <- cor_df$me3 - cor_df$ac # cor 0.2 and difference 0.5 tc <- cor_df[abs(cor_df$diff) > 0.5 & abs(cor_df$me3) > 0.2 & abs(cor_df$ac) > 0.2, ] # order cells by pseudotime porder <- order(pstime) ac_peaks <- smoothed_ac[rownames(tc), porder] me3_peaks <- smoothed_me3[rownames(tc), porder] # order based on which point in trajectory has max value using broader smoothing rowsmoothed <- t(roll_sum(as.matrix(t(me3_peaks)), n = 200)) rownames(rowsmoothed) <- rownames(me3_peaks) rowmax <- apply(rowsmoothed, 1, which.max) me3_peaks <- me3_peaks[order(rowmax), ] ac_peaks <- ac_peaks[order(rowmax), ] rowsmoothed_ac <- t(roll_sum(as.matrix(t(ac_peaks)), n = 100)) mm_ac <- t(apply(as.matrix(rowsmoothed_ac), 2, rev)) rowsmoothed_me3 <- t(roll_sum(as.matrix(t(me3_peaks)), n = 100)) mm_me3 <- t(apply(as.matrix(rowsmoothed_me3), 2, rev)) ac_max <- quantile(mm_ac, probs = seq(0, 1, 0.05)) me3_max <- quantile(mm_me3, probs = seq(0, 1, 0.05)) me3_plot <- mm_me3 me3_plot[me3_plot > me3_max[['95%']]] <- me3_max[['95%']] me3_plot[me3_plot < me3_max[['5%']]] <- me3_max[['5%']] ac_plot <- mm_ac ac_plot[ac_plot > ac_max[['95%']]] <- ac_max[['95%']] ac_plot[ac_plot < ac_max[['5%']]] <- ac_max[['5%']] cols.use <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(100)) png("./plots/hmap_ac.png", width = 5000, height = 5000, units = "px") image(ac_plot, col = cols.use, main = "", xaxt = 'n', yaxt = 'n', ylab = "", xlab = "") dev.off() png("./plots/hmap_me3.png", width = 5000, height = 5000, units = "px") image(me3_plot, col = cols.use, main = "", xaxt = 'n', yaxt = 'n', ylab = "", xlab = "") dev.off() # group peaks by whether they gain me3 or gain ac tc <- tc[order(tc$me3, decreasing = TRUE), ] peaks_me3 <- rownames(tc[tc$me3 > 0, ]) peaks_ac <- rownames(tc[tc$ac > 0, ]) cf_me3 <- ClosestFeature(bmmc, region = peaks_me3) cf_ac <- ClosestFeature(bmmc, region = peaks_ac) cf_me3 <- cf_me3[cf_me3$gene_biotype == "protein_coding", ] cf_ac <- cf_ac[cf_ac$gene_biotype == "protein_coding", ] maxdist <- 50000 closest_me3 <- unique(cf_me3[cf_me3$distance < maxdist, "gene_name"]) closest_ac <- unique(cf_ac[cf_ac$distance < maxdist, "gene_name"]) # save colorbar png("plots/bmmc/colorbar_heatmap.png", width = 10, height = 4, units = "in", res = 400) image(matrix(1:100), col=cols.use) dev.off() # RNA bmmc_rna <- readRDS("objects/fullref.Rds") ct.keep.rna <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma") bmmc_rna_bcell <- bmmc_rna[, bmmc_rna$celltype.l2 %in% ct.keep.rna] # module score for each set of genes bmmc_rna_bcell <- AddModuleScore( object = bmmc_rna_bcell, features = list(setdiff(closest_me3, closest_ac)), name = "Repressed" ) bmmc_rna_bcell <- AddModuleScore( object = bmmc_rna_bcell, features = list(setdiff(closest_ac, closest_me3)), name = "Activated" ) Idents(bmmc_rna_bcell) <- "celltype.l2" levels(bmmc_rna_bcell) <- c("HSC", "LMPP", "CLP", "pro B", "pre B", "transitional B", "Naive B", "Memory B", "Plasma") v1 <- VlnPlot(bmmc_rna_bcell, "Repressed1", pt.size = 0) + NoLegend() + geom_hline(yintercept = 0) + ylab("Module expression") + xlab("") + ylim(c(-0.15, 0.15)) + ggtitle("Repressed genes") + theme_classic() + theme(legend.position = "none", text = element_text(size = 20)) + scale_x_discrete(guide = guide_axis(angle = 45)) v2 <- VlnPlot(bmmc_rna_bcell, "Activated1", pt.size = 0) + NoLegend() + geom_hline(yintercept = 0) + ylab("Module expression") + xlab("") + ylim(c(-0.15, 0.15)) + ggtitle("Activated genes") + theme_classic() + theme(legend.position = "none", text = element_text(size = 20)) + scale_x_discrete(guide = guide_axis(angle = 45)) ggsave(filename = "plots/bmmc/rna_repressed.pdf", plot = v1, height = 4, width = 5) ggsave(filename = "plots/bmmc/rna_activated.pdf", plot = v2, height = 4, width = 5) # add DE test between module scores at start vs end of trajectory module_activated <- bmmc_rna_bcell$Activated1 module_repressed <- bmmc_rna_bcell$Repressed1 progenitor <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("HSC", "LMPP")] mature <- colnames(bmmc_rna_bcell)[bmmc_rna_bcell$celltype.l2 %in% c("Memory B", "Plasma")] t.test(module_activated[progenitor], module_activated[mature]) # <2.2e-16 t.test(module_repressed[progenitor], module_repressed[mature]) # <2.2e-16 ######################## # BMMC atac comparison ######################## bmmc_atac <- readRDS("objects/bmmc_atac.rds") # subset b cell lineage ct.keep <- c("01_HSC", "06_CLP.1", "15_CLP.2", "05_CMP.LMPP", "17_B", "18_Plasma") b_atac <- bmmc_atac[, bmmc_atac$BioClassification %in% ct.keep] DefaultAssay(b_atac) <- "ATAC" b_atac <- FindTopFeatures(b_atac, min.cutoff = 5) b_atac <- RunTFIDF(b_atac) b_atac <- RunSVD(b_atac) b_atac <- RunUMAP(b_atac, reduction = "lsi", dims = 2:20) b_atac <- FindNeighbors(b_atac, reduction = "lsi", dims = 2:20) knn_atac <- b_atac[['ATAC_nn']] # add numeric score for b trajectory b_atac$trajectory <- as.numeric(factor(b_atac$BioClassification, levels = c("01_HSC", "05_CMP.LMPP", "06_CLP.1", "15_CLP.2", "17_B", "18_Plasma"))) b_atac_traj <- b_atac$trajectory b_atac$b <- factor(b_atac$BioClassification, levels = c("01_HSC", "05_CMP.LMPP", "06_CLP.1", "15_CLP.2", "17_B", "18_Plasma")) Idents(b_atac) <- "b" levels(bcell) <- c('HSPC', 'GMP/CLP', 'Pre-B', 'B', 'Plasma') # check dynamic regions in atac dataset tc_regions_atac <- FeatureMatrix( fragments = Fragments(b_atac), features = rownames(tc), cells = colnames(b_atac) ) smoothed_atac <- knn_smooth(counts = tc_regions_atac[, colnames(knn_atac)], knn = knn_atac) smoothed_atac <- t(t(smoothed_atac) / colSums(smoothed_atac)) peak_cor_atac <- cor(t(as.matrix(smoothed_atac)), b_atac_traj)[, 1] pv_atac <- apply(smoothed_atac, 1, function(x) { cor.test(x, b_atac_traj)$p.value }) nc <- peak_cor_atac[abs(peak_cor_atac) < 0.05] ns <- p.adjust(pv_atac[names(nc)]) > 0.01 sum(ns) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | library(Signac) library(Seurat) atac <- readRDS("objects/bmmc_atac.rds") bmmc_dual <- readRDS("objects/bmmc_dual.rds") bmmc_atac <- atac[, atac$tissue == "BMMC"] DefaultAssay(bmmc_atac) <- "ATAC" bmmc_atac <- FindTopFeatures(bmmc_atac) bmmc_atac <- RunTFIDF(bmmc_atac) bmmc_atac <- RunSVD(bmmc_atac) bmmc_atac <- RunUMAP(bmmc_atac, reduction = "lsi", dims = 2:50) # quantify atac peaks in ntt counts <- FeatureMatrix( fragments = Fragments(bmmc_dual[['ac']])[[1]], features = granges(bmmc_atac[['ATAC']]), cells = colnames(bmmc_dual) ) bmmc_dual[['ATAC']] <- CreateChromatinAssay( counts = counts, fragments = Fragments(bmmc_dual[['ac']])[[1]] ) DefaultAssay(bmmc_dual) <- "ATAC" # normalize with reference IDF get_idf <- function(x) { rsums <- rowSums(x = x) idf <- ncol(x = x) / rsums idf <- log(1 + idf) return(idf) } idf <- get_idf(x = GetAssayData(bmmc_atac, assay = "ATAC", slot = "counts")) bmmc_dual <- RunTFIDF(bmmc_dual, idf = idf) # find transfer anchors anchors <- FindTransferAnchors( reference = bmmc_atac, query = bmmc_dual, reduction = "lsiproject", dims = 2:30, reference.reduction = "lsi", reference.assay = "ATAC", query.assay = "ATAC" ) pred <- TransferData( anchorset = anchors, refdata = bmmc_atac$BioClassification, weight.reduction = bmmc_dual[['lsi.me3']], dims = 2:50 ) write.table(pred, "datasets/bmmc_dual_predictions.tsv", quote = FALSE, sep = "\t") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 | library(Signac) library(Seurat) ac <- readRDS("data/ct_pro/H3K27ac.rds") Fragments(ac) <- NULL Fragments(ac) <- CreateFragmentObject( path = "data/ct_pro/H3K27ac_fragments.tsv.gz", cells = Cells(ac) ) saveRDS(object = ac, file = "data/ct_pro/H3K27ac_updated.rds") me <- readRDS("data/ct_pro/H3K27me3.rds") Fragments(me) <- NULL Fragments(me) <- CreateFragmentObject( path = "data/ct_pro/H3K27me3_fragments.tsv.gz", cells = Cells(me) ) saveRDS(object = me, file = "data/ct_pro/H3K27me3_updated.rds") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | import pysam from Bio import SeqIO import os import gzip from pathlib import Path from argparse import ArgumentParser import sys # demux using i5 index only # parse args parser = ArgumentParser(description='Read demultiplexer') parser.add_argument('--read1', help='Path to read1') parser.add_argument('--read_i5', help='Path to read2') parser.add_argument('--read2', help='Path to read3') parser.add_argument('--tn5', help='Path to Tn5 i5 index FASTA file') parser.add_argument('--output', help='Path to output directory') args = parser.parse_args() def hamming_distance(s1, s2): return sum(c1 != c2 for c1, c2 in zip(s1, s2)) def open_fastq(f): if os.path.isfile(f): if f.endswith(".gz"): f_open = gzip.open(f, "rt") # rt read text mode (decodes binary gzip) else: f_open = open(f, "r") return(f_open) else: raise Exception("File not found") def read_sample_barcodes(inpath, maxlen=8): all_bc = dict() for i in SeqIO.parse(open(inpath),'fasta'): all_bc[str(i.seq[:maxlen])] = i.id return(all_bc) def get_entry(f): return([f.readline(), f.readline(), f.readline(), f.readline()]) def extract_barcodes_simple(sequence, bc1_len=8, spacer_len=14): # version with fixed-length barcodes sequence = sequence.strip("\n") tn5_bc = sequence[:bc1_len] cell_bc = sequence[bc1_len+spacer_len:] if len(cell_bc) != 16: print(cell_bc) return((tn5_bc, cell_bc)) tn5_i5_barcodes = read_sample_barcodes(args.tn5, maxlen=8) # using first 8 bases of barcode r1_read = open_fastq(f=args.read1) i5_read = open_fastq(f=args.read_i5) r2_read = open_fastq(f=args.read2) # create dictionary with file handles outf = dict() outpath = Path(args.output) if not outpath.exists(): os.mkdir(outpath) unique_i5_marks = [tn5_i5_barcodes[key] for key in tn5_i5_barcodes.keys()] unique_i5_marks.append('unknown') for i5 in unique_i5_marks: # outputting uncompressed fastq is ~10x faster fname_1 = open(outpath / (i5 + ".R1.fastq"), "w+") fname_2 = open(outpath / (i5 + ".R2.fastq"), "w+") outf[i5] = (fname_1, fname_2) x = 0 while True: r1_entry = get_entry(r1_read) r2_entry = get_entry(r2_read) i5_entry = get_entry(i5_read) if r1_entry[0] == '': break # get barcodes tn5_barcode, cell_barcode = extract_barcodes_simple(sequence=i5_entry[1], bc1_len=8) if tn5_barcode in tn5_i5_barcodes.keys(): mark = tn5_i5_barcodes[tn5_barcode] else: # compute hamming distance hams = [hamming_distance(tn5_barcode, x) < 3 for x in tn5_i5_barcodes.keys()] if sum(hams) == 1: mark = tn5_i5_barcodes[list(tn5_i5_barcodes.keys())[hams.index(True)]] else: mark = "unknown" # add barcodes to r1 and r2 genomic bc_combination = "@" + cell_barcode + ":" + tn5_barcode + "+" r1_entry[0] = bc_combination + r1_entry[0][1:] r2_entry[0] = bc_combination + r2_entry[0][1:] if mark == "unknown": r1_outf = outf['unknown'][0] r2_outf = outf['unknown'][1] else: # write to correct output files based on barcode combination r1_outf = outf[mark][0] r2_outf = outf[mark][1] r1_outf.write("".join(r1_entry)) r2_outf.write("".join(r2_entry)) x += 1 if x % 1e6 == 0: print("Processed " + str(int(x/1e6)) + " million reads", file=sys.stderr, end="\r") # close all files for i in outf.keys(): outf[i][0].close() outf[i][1].close() |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | library(GenomicRanges) me3_peaks <- c("ENCFF138FVQ", "ENCFF720AKK", "ENCFF407VKP","ENCFF588JEN", "ENCFF570DGJ", "ENCFF071JFL", "ENCFF276IMO") ac_peaks <- c("ENCFF240LSH", "ENCFF614FJO", "ENCFF449ITG", "ENCFF321NYQ", "ENCFF544QYY", "ENCFF059WXH", "ENCFF412NPA") bulk_me3 <- "data/encode/ENCFF291LVP.bed.gz" bulk_ac <- "data/encode/ENCFF832RWT.bed.gz" me3_bedfiles <- paste0("data/encode/", me3_peaks, ".bed.gz") ac_bedfiles <- paste0("data/encode/", ac_peaks, ".bed.gz") read_bed <- function(filepath) { df <- read.table(file = filepath, col.names = c("chr", "start", "end", "name", "score", "strand", "fold_change", "neg_log10pvalue_summit", "neg_log10qvalue_summit", "relative_summit_position")) gr <- makeGRangesFromDataFrame(df = df, keep.extra.columns = TRUE, starts.in.df.are.0based = TRUE) return(gr) } me3_ranges <- sapply(me3_bedfiles, read_bed) ac_ranges <- sapply(ac_bedfiles, read_bed) all_me3 <- Reduce(f = c, x = me3_ranges) me3 <- reduce(all_me3) me3 <- keepStandardChromosomes(me3, pruning.mode = "coarse") me3 <- me3[width(me3) < 20000] all_ac <- Reduce(f = c, x = ac_ranges) ac <- reduce(all_ac) ac <- keepStandardChromosomes(ac, pruning.mode = "coarse") ac <- ac[width(ac) < 20000] combined <- c(ac, me3) ac <- as.data.frame(ac) write.table(x = ac, file = "data/encode/h3k27ac.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) me3 <- as.data.frame(me3) write.table(x = me3, file = "data/encode/h3k27me3.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) combined <- as.data.frame(combined) write.table(x = combined, file = "data/encode/all.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) # bulk peaks ac_bulk <- read_bed(bulk_ac) me3_bulk <- read_bed(bulk_me3) all_bulk <- c(ac_bulk, me3_bulk) all_bulk <- as.data.frame(all_bulk) write.table(x = all_bulk, file = "data/encode/all_bulk.bed", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | library(Signac) library(ggplot2) bigWigs <- list( "RNAPII (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-Pol2.bw", "H3K27me3 (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27me.bw", "H3K27ac (multi)" = "data/HEK_K562_bulk/mapped/K562-plex-K27ac.bw", "RNAPII (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw", "H3K27me3 (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw", "H3K27ac (mono)" = "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw" ) bw <- BigwigTrack( region = "chr11-69900000-70910000", bigwig = bigWigs, y_label = "Normalized coverage", ymax = "q90", bigwig.scale = "common", smooth = 1000 ) + theme(legend.position = "none") bw <- bw + scale_fill_manual(values = c("#036C9A", "#D3145A", "#F98401", "#676767", "#676767", "#676767")) ggsave( filename = "plots/hek_k562_bulk/bulk_covplot_k562.png", plot = bw, height = 5, width = 10, units = "in" ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | library(GenomicRanges) regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE) regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE) regions_rna <- reduce(c(regions_s2, regions_s5)) regions_rna <- as.data.frame(regions_rna) write.table(regions_rna, file = "data/k562_peaks/rna.bed", col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 | library(Signac) library(Seurat) library(ggplot2) library(patchwork) library(GenomeInfoDb) library(BSgenome.Hsapiens.UCSC.hg38) library(GenomicRanges) library(Rsamtools) obj <- readRDS("objects/hek_k562.rds") # dimplot for figure 1 p_ac <- DimPlot(obj, reduction = "umap.k27ac", pt.size = 0.1) + ggtitle("H3K27ac") + theme_void() + theme(legend.position = "none") p_me <- DimPlot(obj, reduction = "umap.k27me", pt.size = 0.1) + ggtitle("H3K2me3") + theme_void() + theme(legend.position = "none") p_pol <- DimPlot(obj, reduction = "umap.pol2", pt.size = 0.1) + ggtitle("RNAPII") + theme_void() + theme(legend.position = "none") p_wnn <- DimPlot(obj, reduction = "wnn.umap") + ggtitle("H3K27me3 + H3K27ac + RNAPII") + theme_classic() + xlab("UMAP 1") + ylab("UMAP 2") + theme(legend.position = "none") p1 <- ((p_ac / p_me / p_pol) | p_wnn) + plot_layout(widths = c(1, 2)) ggsave("plots/hek_k562/dimplot_hek_k562.pdf", plot = p1, height = 5, width = 8) # bulk data for comparison hek_mono_k27ac <- "data/HEK_K562_bulk/mapped/HEK-mono-K27ac.bw" hek_mono_k27me3 <- "data/HEK_K562_bulk/mapped/HEK-mono-K27me.bw" hek_mono_s2s5p <- "data/HEK_K562_bulk/mapped/HEK-mono-Pol2.bw" k562_mono_k27ac <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bw" k562_mono_k27me3 <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bw" k562_mono_s2s5p <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bw" DefaultAssay(obj) <- "K27ac" p <- CoveragePlot( object = obj, region = "SCMH1", idents = "K562", assay = list("K27ac", "K27me", "Pol2"), extend.upstream = 20000, extend.downstream = 100000, window = 50, split.assays = TRUE, peaks = FALSE, bigwig = list("H3K27ac" = k562_mono_k27ac, "H3K27me3" = k562_mono_k27me3, "RNAPII" = k562_mono_s2s5p) ) & NoLegend() colors.use <- list("#F98401", "#D3145A", "#036C9A") mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752") p <- p[[1]] p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use) p[[2]] <- p[[2]] & scale_fill_manual(values = c("darkgrey", "darkgrey", "darkgrey")) ggsave( filename = "plots/SCMH1_k562.png", plot = p, dpi = 600, units = "in", height = 5, width = 15 ) # additional regions for supplement p <- CoveragePlot( object = obj, region = "NASP", idents = "K562", assay = list("K27ac", "K27me", "Pol2"), extend.upstream = 20000, extend.downstream = 100000, window = 50, split.assays = TRUE, peaks = FALSE, bigwig = list("H3K27ac" = k562_mono_k27ac, "H3K27me3" = k562_mono_k27me3, "RNAPII" = k562_mono_s2s5p) ) & NoLegend() colors.use <- list("#F98401", "#D3145A", "#036C9A") mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752") p <- p[[1]] p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use) p[[2]] <- p[[2]] & scale_fill_manual(values = c("darkgrey", "darkgrey", "darkgrey")) ggsave( filename = "plots/NASP_k562.png", plot = p, dpi = 600, units = "in", height = 5, width = 15 ) p <- CoveragePlot( object = obj, region = "FAF1", idents = "K562", assay = list("K27ac", "K27me", "Pol2"), extend.upstream = 50000, extend.downstream = 100000, window = 50, split.assays = TRUE, peaks = FALSE, bigwig = list("H3K27ac" = k562_mono_k27ac, "H3K27me3" = k562_mono_k27me3, "RNAPII" = k562_mono_s2s5p) ) & NoLegend() colors.use <- list("#F98401", "#D3145A", "#036C9A") mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752") p <- p[[1]] p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use) p[[2]] <- p[[2]] & scale_fill_manual(values = c("darkgrey", "darkgrey", "darkgrey")) ggsave( filename = "plots/FAF1_k562.png", plot = p, dpi = 600, units = "in", height = 5, width = 15 ) p <- CoveragePlot( object = obj, region = "SDAD1", idents = "K562", assay = list("K27ac", "K27me", "Pol2"), extend.upstream = 100000, extend.downstream = 100000, window = 50, split.assays = TRUE, peaks = FALSE, bigwig = list("H3K27ac" = k562_mono_k27ac, "H3K27me3" = k562_mono_k27me3, "RNAPII" = k562_mono_s2s5p) ) & NoLegend() colors.use <- list("#F98401", "#D3145A", "#036C9A") mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752") p <- p[[1]] p[[1]] <- p[[1]] + scale_fill_manual(values = colors.use) p[[2]] <- p[[2]] & scale_fill_manual(values = c("darkgrey", "darkgrey", "darkgrey")) ggsave( filename = "plots/SDAD1_k562.png", plot = p, dpi = 600, units = "in", height = 5, width = 15 ) # multiCUT&Tag comparison # same me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz" ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz" # mixed ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz" me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz" totals_mct <- CountFragments( fragments = list(me3_me3, ac_ac, ac_me3, me3_ac) ) cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"] # count fragments for each mark total_me3_mct <- CountFragments(fragments = me3_me3, cells = cells_keep_mct) total_ac_mct <- CountFragments(fragments = ac_ac, cells = cells_keep_mct) # count mixed 1/2 for each total_me3_mixed_mct <- CountFragments(fragments = me3_ac, cells = cells_keep_mct) total_ac_mixed_mct <- CountFragments(fragments = ac_me3, cells = cells_keep_mct) total_me3_mixed_mct$frequency_count <- total_me3_mixed_mct$frequency_count / 2 total_ac_mixed_mct$frequency_count <- total_ac_mixed_mct$frequency_count / 2 mct_counts_me3 <- total_me3_mct$frequency_count names(mct_counts_me3) <- total_me3_mct$CB mct_counts_me3[total_me3_mixed_mct$CB] <- mct_counts_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count mct_counts_me3[total_ac_mixed_mct$CB] <- mct_counts_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count mct_counts_ac <- total_ac_mct$frequency_count names(mct_counts_ac) <- total_ac_mct$CB mct_counts_ac[total_ac_mixed_mct$CB] <- mct_counts_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$frequency_count mct_counts_ac[total_me3_mixed_mct$CB] <- mct_counts_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$frequency_count # same for reads mct_reads_me3 <- total_me3_mct$reads_count names(mct_reads_me3) <- total_me3_mct$CB mct_reads_me3[total_me3_mixed_mct$CB] <- mct_reads_me3[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count mct_reads_me3[total_ac_mixed_mct$CB] <- mct_reads_me3[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count mct_reads_ac <- total_ac_mct$reads_count names(mct_reads_ac) <- total_ac_mct$CB mct_reads_ac[total_ac_mixed_mct$CB] <- mct_reads_ac[total_ac_mixed_mct$CB] + total_ac_mixed_mct$reads_count mct_reads_ac[total_me3_mixed_mct$CB] <- mct_reads_ac[total_me3_mixed_mct$CB] + total_me3_mixed_mct$reads_count count_df <- data.frame(count = c(mct_counts_me3, mct_counts_ac), mark = c(rep("H3K27me3", length(mct_counts_me3)), rep("H3K27ac", length(mct_counts_ac))), dataset = "multiCUT&Tag", reads = c(mct_reads_me3, mct_reads_ac) ) # ntt total_me3 <- CountFragments( fragments = "data/HEK_K562_sc/sinto/K27me.bed.gz", cells = colnames(obj) ) total_ac <- CountFragments( fragments = "data/HEK_K562_sc/sinto/K27ac.bed.gz", cells = colnames(obj) ) total_pol <- CountFragments( fragments = "data/HEK_K562_sc/sinto/Pol2.bed.gz", cells = colnames(obj) ) # construct total vectors me3 <- total_me3$frequency_count names(me3) <- total_me3$CB df1 <- data.frame( count = me3, mark = "H3K27me3", dataset = "scNTT-seq" ) df1$reads <- total_me3$reads_count ac <- total_ac$frequency_count names(ac) <- total_ac$CB df2 <- data.frame( count = ac, mark = "H3K27ac", dataset = "scNTT-seq" ) df2$reads <- total_ac$reads_count pol <- total_pol$frequency_count names(pol) <- total_pol$CB df3 <- data.frame( count = pol, mark = "RNAPII", dataset = "scNTT-seq" ) df3$reads <- total_pol$reads_count count_df <- rbind(count_df, df1, df2, df3) count_df <- rbind(count_df, data.frame(count=c(1,1), mark="RNAPII", dataset="multiCUT&Tag", reads=c(1,1))) # total fragments, split by mark p3 <- ggplot(data = count_df, mapping = aes(mark, reads, fill=dataset)) + geom_boxplot(outlier.size = 0.1) + scale_y_log10() + theme_bw() + ylab("Total reads per cell") + xlab("Antibody") + theme(legend.position = "none") + ggtitle("Reads per cell") p4 <- ggplot(data = count_df, mapping = aes(mark, count, fill=dataset)) + geom_boxplot(outlier.size = 0.1) + scale_y_log10() + theme_bw() + ylab("Total fragments per cell") + xlab("Antibody") + ggtitle("Fragments per cell") ggsave( filename = "plots/hek_k562/sensitivity.png", plot = (p3 | p4), height = 4, width = 7, dpi = 500 ) # Region heatmaps atac_peaks <- rtracklayer::import("data/K562_ATAC/ENCFF006OFA.bigBed") atac_peaks <- reduce(atac_peaks) chr.keep <- c(paste0("chr", 1:22), "chrX") atac_peaks <- keepSeqlevels(atac_peaks, value = chr.keep, pruning.mode = "coarse") atac_peaks <- subsetByOverlaps(atac_peaks, ranges = blacklist_hg38_unified, invert = TRUE) regions_me3 <- read.table("data/k562_peaks/ENCFF031FSF.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE) regions_me3$peak <- "H3K27me3" regions_ac <- read.table("data/k562_peaks/ENCFF038DDS.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE) regions_ac$peak <- "H3K27ac" regions_s5 <- read.table("data/k562_peaks/ENCFF053XYZ.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_s5 <- makeGRangesFromDataFrame(regions_s5, keep.extra.columns = TRUE) regions_s2 <- read.table("data/k562_peaks/ENCFF266OPF.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_s2 <- makeGRangesFromDataFrame(regions_s2, keep.extra.columns = TRUE) regions_rna <- reduce(c(regions_s2, regions_s5)) regions_rna$peak <- "RNAPII" regions <- c(regions_me3, regions_ac, regions_rna) regions <- keepStandardChromosomes(regions, pruning.mode = "coarse") regions <- dropSeqlevels(regions, value = "chrM", pruning.mode = "coarse") regions <- dropSeqlevels(regions, value = "chrY", pruning.mode = "coarse") atac_chr1 <- atac_peaks[seqnames(atac_peaks) == 'chr1'] me3_peaks <- regions_me3[seqnames(regions_me3) == 'chr1'] ac_peaks <- regions_ac[seqnames(regions_ac) == 'chr1'] pol2_peaks <- regions_rna[seqnames(regions_rna) == 'chr1'] # ATAC obj <- RegionMatrix( object = obj, regions = atac_chr1, key = "ATAC", assay = "K27ac" ) obj <- RegionMatrix( object = obj, regions = atac_chr1, key = "ATAC", assay = "K27me" ) obj <- RegionMatrix( object = obj, regions = atac_chr1, key = "ATAC", assay = "Pol2" ) # me3 obj <- RegionMatrix( object = obj, regions = me3_peaks, key = "me3", assay = "K27ac" ) obj <- RegionMatrix( object = obj, regions = me3_peaks, key = "me3", assay = "K27me" ) obj <- RegionMatrix( object = obj, regions = me3_peaks, key = "me3", assay = "Pol2" ) # ac obj <- RegionMatrix( object = obj, regions = ac_peaks, key = "ac", assay = "K27ac" ) obj <- RegionMatrix( object = obj, regions = ac_peaks, key = "ac", assay = "K27me" ) obj <- RegionMatrix( object = obj, regions = ac_peaks, key = "ac", assay = "Pol2" ) # pol2 obj <- RegionMatrix( object = obj, regions = pol2_peaks, key = "pol2", assay = "K27ac" ) obj <- RegionMatrix( object = obj, regions = pol2_peaks, key = "pol2", assay = "K27me" ) obj <- RegionMatrix( object = obj, regions = pol2_peaks, key = "pol2", assay = "Pol2" ) # plots cm <- list("K27ac" = "#F98401", "K27me" = "#D3145A", "Pol2" = "#036C9A") p4 <- RegionHeatmap( object = obj, assay = c("K27ac", "K27me", "Pol2"), cols = cm, key = "ATAC", min.counts = 20, normalize = FALSE, window = 50, nrow = 1, idents = "K562" ) & ylab("ATAC peaks") p5 <- RegionHeatmap( object = obj, assay = c("K27me", "K27ac", "Pol2"), cols = cm, key = "me3", min.counts = 20, normalize = FALSE, window = 50, nrow = 1, idents = "K562" ) & ylab("H3K27me3 peaks") p6 <- RegionHeatmap( object = obj, assay = c("K27ac", "K27me", "Pol2"), cols = cm, key = "ac", min.counts = 20, normalize = FALSE, window = 50, nrow = 1, idents = "K562" ) & ylab("H3K27ac peaks") p7 <- RegionHeatmap( object = obj, assay = c("Pol2", "K27ac", "K27me"), cols = cm, key = "pol2", min.counts = 20, normalize = FALSE, window = 50, nrow = 1, idents = "K562" ) & ylab("RNAPII peaks") update_plot <- function(p, y) { p[[1]] <- p[[1]] + ylab(y) p[[2]] <- p[[2]] + theme(axis.title.y=element_blank(), axis.text.y=element_blank()) p[[3]] <- p[[3]] + theme(axis.title.y=element_blank(), axis.text.y=element_blank()) return(p) } p4 <- update_plot(p4, y = "ATAC peaks") p5 <- update_plot(p5, y = "H3K27me3 peaks") p6 <- update_plot(p6, y = "H3K27ac peaks") p7 <- update_plot(p7, y = "RNAPII peaks") pp <- (p4 | p5 | p6 | p7) & theme(legend.position = "none") ggsave(filename = "plots/hek_k562/region_heatmaps.png", plot = pp, height = 6, width = 12) # genome-wide correlation plot # bulk fragfiles k562_mono_k27ac_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27ac.bed.gz" k562_mono_k27me3_frag <- "data/HEK_K562_bulk/mapped/K562-mono-K27me.bed.gz" k562_mono_s2s5p_frag <- "data/HEK_K562_bulk/mapped/K562-mono-Pol2.bed.gz" # split single-cell sc_k27_ac <- "data/HEK_K562_sc/split/K27ac_k562_filter.bed.gz" sc_k27_me <- "data/HEK_K562_sc/split/K27me_k562_filter.bed.gz" sc_pol <- "data/HEK_K562_sc/split/Pol2_k562_filter.bed.gz" QuantifyRegions <- function(fragfile, regions) { results <- vector(mode = "numeric", length = length(regions)) # get total fragments for normalization # just read whole thing f <- read.table(fragfile, sep = "\t", header = FALSE) total <- nrow(f) rm(f) gc() # open tabix connection tabix.file <- TabixFile(file = fragfile) open(con = tabix.file) # get fragments in each region for (j in seq_along(regions)) { reads <- scanTabix(file = tabix.file, param = regions[j]) results[j] <- sum(sapply(X = reads, FUN = length)) } # close connection close(con = tabix.file) cmp <- (results / total) * 10^6 return(cmp) } # need bulk vs bulk # bulk vs single cell # single cell vs single cell k27ac_cov <- QuantifyRegions(k562_mono_k27ac_frag, regions) k27me3_cov <- QuantifyRegions(k562_mono_k27me3_frag, regions) pol2_cov <- QuantifyRegions(k562_mono_s2s5p_frag, regions) pol2_sc_cov <- QuantifyRegions(sc_pol, regions) ac_sc_cov <- QuantifyRegions(sc_k27_ac, regions) me_sc_cov <- QuantifyRegions(sc_k27_me, regions) # create matrix mat <- cbind(k27ac_cov, ac_sc_cov, k27me3_cov, me_sc_cov, pol2_cov, pol2_sc_cov) correlation_matrix <- cor(mat) colnames(correlation_matrix) <- c("Bulk H3K27ac", "sc H3K27ac", "Bulk H3K27me3", "sc H3K27me3", "Bulk RNAPII", "sc RNAPII") rownames(correlation_matrix) <- colnames(correlation_matrix) df <- as.data.frame(as.table(correlation_matrix)) p8 <- ggplot(df, aes(Var1, Var2, fill = Freq)) + geom_tile() + scale_fill_distiller(palette = "RdYlBu") + xlab("") + ylab("") + guides(fill = guide_legend(title = "Pearson\ncorrelation")) + theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) ggsave(filename = "plots/hek_k562/correlation.png", plot = p8, height = 5, width = 6) # scatterplots # 1. use encode peak regions for each mark # 2. plot one mark vs another # 3. fit linear model get_cod_expression <- function(m) { eq <- substitute(~~italic(R)^2~"="~r2, list(r2 = format(summary(m)$r.squared, digits = 2))) cod <- as.expression(eq) return(cod) } mat <- as.data.frame(mat) mat$region <- regions$peak colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A") # bulk-bulk m.use <- mat[mat$region %in% c("H3K27me3", "H3K27ac"), ] bulk_ac_me3 <- ggplot(m.use, aes(k27ac_cov, k27me3_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (bulk, single antibody)") + ylab("H3K27me3 (bulk, single antibody)") + ggtitle(get_cod_expression(lm(k27ac_cov ~ k27me3_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("RNAPII", "H3K27ac"), ] bulk_ac_pol <- ggplot(m.use, aes(k27ac_cov, pol2_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 300)) + ylim(c(0, 300)) + xlab("H3K27ac (bulk, single antibody)") + ylab("RNAPII (bulk, single antibody)") + ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("RNAPII", "H3K27me3"), ] bulk_pol_me <- ggplot(m.use, aes(pol2_cov, k27me3_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("RNAPII (bulk, single antibody)") + ylab("H3K27me3 (bulk, single antibody)") + ggtitle(get_cod_expression(lm(pol2_cov ~ k27me3_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) # bulk-sc m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ] bulk_ac_sc_me3 <- ggplot(m.use, aes(k27ac_cov, me_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 800)) + ylim(c(0, 800)) + xlab("H3K27ac (bulk, single antibody)") + ylab("H3K27me3 (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(k27ac_cov ~ me_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ] bulk_ac_sc_pol <- ggplot(m.use, aes(k27ac_cov, pol2_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (bulk, single antibody)") + ylab("RNAPII (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(k27ac_cov ~ pol2_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ] bulk_pol_sc_me <- ggplot(m.use, aes(pol2_cov, me_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("RNAPII (bulk, single antibody)") + ylab("H3K27me3 (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(pol2_cov ~ me_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27ac"), ] bulk_ac_sc_ac <- ggplot(m.use, aes(k27ac_cov, ac_sc_cov, color = region)) + geom_point(size = 0.1) + theme_classic() + scale_color_manual(values = colormap) + xlim(c(0, 600)) + ylim(c(0, 600)) + xlab("H3K27ac (bulk, single antibody)") + ylab("H3K27ac (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(k27ac_cov ~ ac_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27me3"), ] bulk_me_sc_me <- ggplot(m.use, aes(k27me3_cov, me_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 300)) + ylim(c(0, 300)) + xlab("H3K27me3 (bulk, single antibody)") + ylab("H3K27me3 (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(k27me3_cov ~ me_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("RNAPII"), ] bulk_pol_sc_pol <- ggplot(m.use, aes(pol2_cov, pol2_sc_cov, color = region)) + geom_point(size = 0.1) + theme_classic() + scale_color_manual(values = colormap) + xlim(c(0, 200)) + ylim(c(0, 200)) + xlab("RNAPII (bulk, single antibody)") + ylab("RNAPII (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(pol2_cov ~ pol2_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) # sc-sc m.use <- mat[mat$region %in% c("H3K27ac", "H3K27me3"), ] sc_ac_sc_me3 <- ggplot(m.use, aes(ac_sc_cov, me_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (single-cell, multiplexed)") + ylab("H3K27me3 (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27ac", "RNAPII"), ] sc_ac_sc_pol <- ggplot(m.use, aes(ac_sc_cov, pol2_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (single-cell, multiplexed)") + ylab("RNAPII (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(ac_sc_cov ~ pol2_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region %in% c("H3K27me3", "RNAPII"), ] sc_pol_sc_me <- ggplot(m.use, aes(pol2_sc_cov, me_sc_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 200)) + ylim(c(0, 200)) + xlab("RNAPII (single-cell, multiplexed)") + ylab("H3K27me3 (single-cell, multiplexed)") + ggtitle(get_cod_expression(lm(pol2_sc_cov ~ me_sc_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) pp <- (bulk_ac_me3 / bulk_ac_pol / bulk_pol_me) | (bulk_ac_sc_me3 / bulk_ac_sc_pol / bulk_pol_sc_me) | (sc_ac_sc_me3 / sc_ac_sc_pol / sc_pol_sc_me) | (bulk_me_sc_me / bulk_ac_sc_ac / bulk_pol_sc_pol) pp <- pp & NoLegend() ggsave(filename = "plots/hek_k562/scatterplots.png", plot = pp, height = 10, width = 12) # ternary plot for single-cell data if (!requireNamespace(package = "ggtern", quietly = TRUE)) { install.packages("ggtern") } library(ggtern) region_counts <- mat[, c("region", "ac_sc_cov", "me_sc_cov", "pol2_sc_cov")] colnames(region_counts) <- c("region", "H3K27ac", "H3K27me3", "RNAPII") # randomize plotting order region_counts <- region_counts[sample(1:nrow(region_counts)), ] gp <- ggtern(region_counts, aes(x = H3K27ac, y = H3K27me3, z = RNAPII, color = region)) + geom_point(size = 0.1, alpha = 0.2) + scale_color_manual(values = colors.use) + theme_classic() ggsave(filename = "plots/hek_k562/ternary.png", plot = gp, height = 8, width = 8) ## KNN purity ## if (!requireNamespace(package = "RANN", quietly = TRUE)) { install.packages("RANN") } library(RANN) knn_purity_emb <- function(embeddings, dims, clusters, k) { nn <- nn2(data = embeddings[, dims], k = k + 1)$nn.idx[, 2:k] nn_purity <- vector(mode = "numeric", length = length(x = clusters)) for (i in seq_len(length.out = nrow(x = nn))) { nn_purity[i] <- sum(clusters[nn[i, ]] == clusters[i]) / k } return(nn_purity) } knn_purity_graph <- function(graph, clusters) { # pre-existing graph, can't change k nn_purity <- vector(mode = "numeric", length = length(x = clusters)) for (i in seq_len(length.out = nrow(x = graph))) { nbr <- names(which(graph[i, ] == 1)) nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ]) } return(nn_purity) } # compute wnn for each combination of modalities obj <- FindMultiModalNeighbors( object = obj, knn.graph.name = "wknn_all", reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), dims.list = list(2:10, 2:10, 2:10) ) obj <- FindMultiModalNeighbors( object = obj, knn.graph.name = "wknn_ac_me3", reduction.list = list("lsi.k27ac", "lsi.k27me"), dims.list = list(2:10, 2:10) ) obj <- FindMultiModalNeighbors( object = obj, knn.graph.name = "wknn_ac_pol2", reduction.list = list("lsi.k27ac", "lsi.pol2"), dims.list = list(2:10, 2:10) ) obj <- FindMultiModalNeighbors( object = obj, knn.graph.name = "wknn_me3_pol2", reduction.list = list("lsi.k27me", "lsi.pol2"), dims.list = list(2:10, 2:10) ) dims <- 2:10 k <- 50 emb_me3 <- Embeddings(obj[['lsi.k27me']]) emb_ac <- Embeddings(obj[['lsi.k27ac']]) emb_pol <- Embeddings(obj[['lsi.pol2']]) ct <- obj$celltype kme3 <- data.frame( purity = knn_purity_emb(embeddings = emb_me3, dims = dims, clusters = ct, k = k), assay = "H3K27me3", celltype = ct ) kac <- data.frame( purity = knn_purity_emb(embeddings = emb_ac, dims = dims, clusters = ct, k = k), assay = "H3K27ac", celltype = ct ) kpol <- data.frame( purity = knn_purity_emb(embeddings = emb_pol, dims = dims, clusters = ct, k = k), assay = "RNAPII", celltype = ct ) kme3_ac <- data.frame( purity = knn_purity_graph(graph = obj[['wknn_ac_me3']], clusters = ct), assay = "H3K27me3 + H3K27ac", celltype = ct ) kac_pol <- data.frame( purity = knn_purity_graph(graph = obj[['wknn_ac_pol2']], clusters = ct), assay = "H3K27ac + RNAPII", celltype = ct ) kpol_me <- data.frame( purity = knn_purity_graph(graph = obj[['wknn_me3_pol2']], clusters = ct), assay = "H3K27me3 + RNAPII", celltype = ct ) kpol_me_ac <- data.frame( purity = knn_purity_graph(graph = obj[['wknn_all']], clusters = ct), assay = "H3K27me3 + H3K27ac + RNAPII", celltype = ct ) kdf <- rbind(kme3, kac, kpol, kme3_ac, kac_pol, kpol_me, kpol_me_ac) kdf$assay <- factor(kdf$assay, levels = c("RNAPII", "H3K27ac", "H3K27me3", "H3K27ac + RNAPII", "H3K27me3 + RNAPII", "H3K27me3 + H3K27ac", "H3K27me3 + H3K27ac + RNAPII")) knn_p <- ggplot(kdf, aes(y = assay, x = purity, fill = assay)) + ggridges::geom_density_ridges(bandwidth = 0.008) + xlab("Fraction of neighbors of same cell type") + ylab("Assay") + theme_classic() + scale_fill_manual(values = c("#036C9A", "#F98401", "#D3145A", "#7e784e", "#6b407a", "#e64c2e", "#9a5752")) + xlim(c(0.8, 1.01)) + theme(legend.position = "none", text = element_text(size = 12)) + geom_vline(xintercept = 1) ggsave(filename = "plots/hek_k562/knn_purity_wnn.png", plot = knn_p, height = 4, width = 7) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | library(Signac) library(ggplot2) library(patchwork) library(Rsamtools) QuantifyRegions <- function(fragfile, regions) { results <- vector(mode = "numeric", length = length(regions)) # get total fragments for normalization # just read whole thing f <- read.table(fragfile, sep = "\t", header = FALSE) total <- nrow(f) rm(f) gc() # open tabix connection tabix.file <- TabixFile(file = fragfile) open(con = tabix.file) # get fragments in each region for (j in seq_along(regions)) { reads <- scanTabix(file = tabix.file, param = regions[j]) results[j] <- sum(sapply(X = reads, FUN = length)) } # close connection close(con = tabix.file) cmp <- (results / total) * 10^6 return(cmp) } get_cod_expression <- function(m) { eq <- substitute(~~italic(R)^2~"="~r2, list(r2 = format(summary(m)$r.squared, digits = 2))) cod <- as.expression(eq) return(cod) } colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401") # multiCUT&Tag # same me3_me3 <- "data/multict/fragments/H3K27me3-H3K27me3.tsv.gz" ac_ac <- "data/multict/fragments/H3K27ac-H3K27ac.tsv.gz" # mixed ac_me3 <- "data/multict/fragments/H3K27ac-H3K27me3.tsv.gz" me3_ac <- "data/multict/fragments/H3K27me3-H3K27ac.tsv.gz" # peaks h3k27me3 <- read.table("data/mesc/ENCFF008XKX.bed.gz", sep = "\t") # H3K27me3 ES-Bruce4 mm10 h3k27ac <- read.table("data/mesc/ENCFF360VIS.bed.gz", sep = "\t") # H3K27ac ES-Bruce4 mm10 h3k27me3 <- makeGRangesFromDataFrame( df = h3k27me3, seqnames.field = "V1", start.field = "V2", end.field = "V3" ) h3k27ac <- makeGRangesFromDataFrame( df = h3k27ac, seqnames.field = "V1", start.field = "V2", end.field = "V3" ) h3k27me3 <- keepStandardChromosomes(h3k27me3, pruning.mode = "coarse") h3k27ac <- keepStandardChromosomes(h3k27ac, pruning.mode = "coarse") totals_mct <- CountFragments( fragments = list(me3_me3, ac_ac, ac_me3, me3_ac) ) cells_keep_mct <- totals_mct[totals_mct$frequency_count > 200, "CB"] me3_me3_filt <- "data/multict/fragments/H3K27me3-H3K27me3_filtered.tsv.gz" ac_ac_filt <- "data/multict/fragments/H3K27ac-H3K27ac_filtered.tsv.gz" ac_me3_filt <- "data/multict/fragments/H3K27ac-H3K27me3_filtered.tsv.gz" me3_ac_filt <- "data/multict/fragments/H3K27me3-H3K27ac_filtered.tsv.gz" # filter cells FilterCells( fragments = me3_me3, cells = cells_keep_mct, outfile = me3_me3_filt ) FilterCells( fragments = ac_ac, cells = cells_keep_mct, outfile = ac_ac_filt ) FilterCells( fragments = ac_me3, cells = cells_keep_mct, outfile = ac_me3_filt ) FilterCells( fragments = me3_ac, cells = cells_keep_mct, outfile = me3_ac_filt ) h3k27me3$peak <- "H3K27me3" h3k27ac$peak <- "H3K27ac" regions <- c(h3k27me3, h3k27ac) ########### # scatter # ########### # quantify regions me3_me3 <- QuantifyRegions(me3_me3_filt, regions) me3_ac <- QuantifyRegions(me3_ac_filt, regions) ac_ac <- QuantifyRegions(ac_ac_filt, regions) ac_me3 <- QuantifyRegions(ac_me3_filt, regions) mat <- cbind(me3_me3, me3_ac, ac_ac, ac_me3) mat <- as.data.frame(mat) mat$region <- regions$peak mat$me3 <- mat$me3_me3 + (mat$me3_ac/2) + (mat$ac_me3/2) mat$ac <- mat$ac_ac + (mat$me3_ac/2) + (mat$ac_me3/2) p_me3_ac <- ggplot(mat, aes(x = ac, y = me3, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + theme(legend.position = 'none') + xlim(c(0, 400)) + ylim(c(0, 400)) + ylab("H3K27me3") + xlab("H3K27ac") + ggtitle(get_cod_expression(lm(me3 ~ ac, data = mat))) + geom_abline(intercept = 0, slope = 1, linetype = 2) ggsave(filename = "plots/hek_k562/multi_cuttag_scatter.png", plot = p_me3_ac, height = 4, width = 4) ############ ### FRiP ### ############ get_total <- function(frags, cells) { tot <- CountFragments(fragments = frags@path, cells = cells) totals <- tot$frequency_count names(totals) <- tot$CB return(totals) } get_in_peaks <- function(obj, regions) { counts <- FeatureMatrix( fragments = obj, features = regions, cells = obj@cells ) return(colSums(counts)) } me3_me3_obj <- CreateFragmentObject(me3_me3_filt, cells = cells_keep_mct) ac_ac_obj <- CreateFragmentObject(ac_ac_filt, cells = cells_keep_mct) me3_ac_obj <- CreateFragmentObject(me3_ac_filt, cells = cells_keep_mct) ac_me3_obj <- CreateFragmentObject(ac_me3_filt, cells = cells_keep_mct) totals_me3_me3 <- get_total(me3_me3_obj, cells = cells_keep_mct) totals_ac_me3 <- get_total(ac_me3_obj, cells = cells_keep_mct) totals_me3_ac <- get_total(me3_ac_obj, cells = cells_keep_mct) totals_ac_ac <- get_total(ac_ac_obj, cells = cells_keep_mct) in_me3_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27me3) in_me3_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27me3) in_me3_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27me3) in_me3_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27me3) in_ac_peak_me3_me3 <- get_in_peaks(obj = me3_me3_obj, regions = h3k27ac) in_ac_peak_ac_ac <- get_in_peaks(obj = ac_ac_obj, regions = h3k27ac) in_ac_peak_me3_ac <- get_in_peaks(obj = me3_ac_obj, regions = h3k27ac) in_ac_peak_ac_me3 <- get_in_peaks(obj = ac_me3_obj, regions = h3k27ac) me3_frip <- data.frame( cell = names(totals_me3_me3), total = totals_me3_me3 + (totals_ac_me3/2) + (totals_me3_ac/2), in_peak = in_me3_peak_me3_me3 + (in_me3_peak_me3_ac/2) + (in_me3_peak_ac_me3/2), assay = "H3K27me3", peak = "H3K27me3" ) me3_frip$FRiP <- me3_frip$in_peak / me3_frip$total ac_frip <- data.frame( cell = names(totals_ac_ac), total = totals_ac_ac + (totals_ac_me3/2) + (totals_me3_ac/2), in_peak = in_ac_peak_ac_ac + (in_ac_peak_me3_ac/2) + (in_ac_peak_ac_me3/2), assay = "H3K27ac", peak = "H3K27ac" ) ac_frip$FRiP <- ac_frip$in_peak / ac_frip$total df <- rbind(me3_frip, ac_frip) p <- ggplot(df, aes(x = assay, y = FRiP, fill = assay)) + geom_boxplot(outlier.size = 0.5) + scale_fill_manual(values = colormap) + xlab("Antibody") + ggtitle("multiCUT&Tag FRiP") + theme_bw() + theme(legend.position = "none") ggsave(filename = "plots/hek_k562/multi_cuttag_frip.png", plot = p, height = 4, width = 3) saveRDS(object = df, file = "objects/frip_mct.rds") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 | library(Seurat) library(Signac) library(EnsDb.Hsapiens.v86) library(BSgenome.Hsapiens.UCSC.hg38) library(future) library(ggplot2) library(patchwork) args <- commandArgs(trailingOnly = TRUE) threads <- args[[1]] outfile <- args[[2]] fpath <- args[[3]] plan("multicore", workers = as.integer(threads)) options(future.globals.maxSize = Inf) # annotation annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86) seqlevelsStyle(annot) <- "UCSC" genome.use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:23] fragfiles <- list.files( path = fpath, pattern = "*.bed.gz$", full.names = TRUE ) fname <- gsub(pattern = paste0(fpath, "/"), replacement = "", x = fragfiles) fname <- gsub(pattern = ".bed.gz", replacement = "", x = fname) names(fragfiles) <- fname total_counts <- CountFragments(fragments = as.list(fragfiles)) cells.keep <- total_counts[total_counts$frequency_count > 150, "CB"] create_assay <- function(frag, cells, genome.use) { frag_obj <- CreateFragmentObject( path = frag, cells = cells ) counts <- AggregateTiles( object = frag_obj, cells = cells, min_counts = 1, binsize = 10000, genome = genome.use ) return(counts) } all.counts <- lapply( X = fragfiles, FUN = create_assay, cells = cells.keep, genome.use = genome.use ) # combine matrices for same barcode pairs names(all.counts) <- fname all.cells <- lapply(all.counts, colnames) common.cells <- Reduce(intersect, all.cells) all.counts <- lapply(all.counts, function(x) x[, common.cells]) combined.matrix <- do.call(what = rbind, args = all.counts) # create seurat object obj <- CreateSeuratObject( counts = combined.matrix, min.cells = 10, assay = "all" ) obj <- obj[, obj$nCount_all < 10000 & obj$nCount_all > 100] for (i in seq_along(all.counts)) { mat <- all.counts[[i]][, colnames(obj)] assay <- CreateChromatinAssay( counts = mat, min.features = -1, fragments = fragfiles[[i]], annotation = annot ) obj[[names(all.counts)[i]]] <- assay } obj <- subset(obj, subset = nCount_K27ac > 75 & nCount_K27me > 150 & nCount_Pol2 > 100) # process obj <- RunTFIDF(obj) obj <- FindTopFeatures(obj, min.cutoff = 50) obj <- RunSVD(obj) obj <- RunUMAP(obj, reduction = 'lsi', dims = 2:10) # create lsi, umap from each assay DefaultAssay(obj) <- "K27ac" obj <- RunTFIDF(obj) obj <- FindTopFeatures(obj, min.cutoff = 10) obj <- RunSVD(obj, reduction.name = "lsi.k27ac") obj <- RunUMAP(obj, reduction = 'lsi.k27ac', reduction.name = "umap.k27ac", dims = 2:10) DefaultAssay(obj) <- "K27me" obj <- RunTFIDF(obj) obj <- FindTopFeatures(obj, min.cutoff = 10) obj <- RunSVD(obj, reduction.name = "lsi.k27me") obj <- RunUMAP(obj, reduction = 'lsi.k27me', reduction.name = "umap.k27me", dims = 2:10) DefaultAssay(obj) <- "Pol2" obj <- RunTFIDF(obj) obj <- FindTopFeatures(obj, min.cutoff = 10) obj <- RunSVD(obj, reduction.name = "lsi.pol2") obj <- RunUMAP(obj, reduction = 'lsi.pol2', reduction.name = "umap.pol2", dims = 2:10) # WNN obj <- FindMultiModalNeighbors( object = obj, reduction.list = list("lsi.k27ac", "lsi.k27me", "lsi.pol2"), dims.list = list(2:10, 2:10, 2:10) ) obj <- RunUMAP( object = obj, nn.name = "weighted.nn", reduction.name = "wnn.umap" ) obj <- FindClusters( object = obj, graph.name = "wsnn", algorithm = 3, resolution = 0.05 ) obj <- RenameIdents(obj, list("0" = "K562", "1" = "HEK")) obj$celltype <- Idents(obj) # save object saveRDS(object = obj, file = outfile) # write cell names for vireo writeLines(colnames(obj), "data/HEK_K562_sc/cells.txt") # split fragment files by cell type hek <- WhichCells(object = obj, idents = "HEK") k562 <- WhichCells(object = obj, idents = "K562") filter_chrom <- function(infile, outfile, chrom.keep) { # open input and output files con <- gzfile(infile, 'rb') outf <- file(description = outfile, "w") # read line by line, writing to output file while (TRUE) { line = readLines(con, n = 1) if (length(line) == 0) { break } else if (substring(text = line, first = 1, last = 1) == "#") { next # skip header } else { # check chromosome fields <- unlist(x = strsplit(x = line, "\t")) if (!(fields[[1]] %in% chrom.keep)) { next } else { writeLines(text = line, con = outf) } } } close(con) close(outf) system( command = paste0("bgzip ", outfile), wait = TRUE, ignore.stderr = FALSE, ignore.stdout = FALSE ) system( command = paste0("tabix -p bed ", paste0(outfile, '.gz')), wait = TRUE, ignore.stderr = FALSE, ignore.stdout = FALSE ) } chr.use <- names(genome.use) for (i in seq_along(fragfiles)) { outf <- paste0("data/HEK_K562_sc/split/", names(fragfiles)[[i]]) FilterCells( fragments = fragfiles[[i]], cells = k562, outfile = paste0(outf, "_k562.bed.gz") ) FilterCells( fragments = fragfiles[[i]], cells = hek, outfile = paste0(outf, "_hek.bed.gz") ) # filter chromosomes filter_chrom( infile = paste0(outf, "_k562.bed.gz"), outfile = paste0(outf, "_k562_filter.bed"), chrom.keep = chr.use ) filter_chrom( infile = paste0(outf, "_hek.bed.gz"), outfile = paste0(outf, "_hek_filter.bed"), chrom.keep = chr.use ) } |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | library(Signac) library(Seurat) library(EnsDb.Hsapiens.v86) annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86) seqlevelsStyle(annot) <- "UCSC" me3 <- list("data/henikoff/GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz", "data/henikoff/GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz") ac <- "data/henikoff/GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz" # count fragments per cell frags_me3_1 <- CountFragments(me3[[1]]) frags_me3_2 <- CountFragments(me3[[2]]) frags_ac <- CountFragments(ac) min_frags <- 500 cells_me3_1 <- frags_me3_1[frags_me3_1$frequency_count > min_frags, "CB"] cells_me3_2 <- frags_me3_2[frags_me3_2$frequency_count > min_frags, "CB"] cells_ac <- frags_ac[frags_ac$frequency_count > min_frags, "CB"] f_me3_1 <- CreateFragmentObject(me3[[1]], cells = cells_me3_1) f_me3_2 <- CreateFragmentObject(me3[[2]], cells = cells_me3_2) f_ac <- CreateFragmentObject(ac, cells = cells_ac) me3_counts_1 <- FeatureMatrix( fragments = f_me3_1, features = granges(pbmc[['me3']]), cells = cells_me3_1 ) me3_counts_2 <- FeatureMatrix( fragments = f_me3_2, features = granges(pbmc[['me3']]), cells = cells_me3_2 ) ac_counts <- FeatureMatrix( fragments = f_ac, features = granges(pbmc[['ac']]), cells = cells_ac ) # change cell barcodes to add batch info colnames(me3_counts_1) <- paste0(colnames(me3_counts_1), ":1") colnames(me3_counts_2) <- paste0(colnames(me3_counts_2), ":2") names(f_me3_1@cells) <- paste0(names(f_me3_1@cells), ":1") names(f_me3_2@cells) <- paste0(names(f_me3_2@cells), ":2") me3 <- CreateSeuratObject( counts = CreateChromatinAssay( counts = cbind(me3_counts_1, me3_counts_2), fragments = list(f_me3_1, f_me3_2), annotation = annot ), names.field = 2, names.delim = ":", assay = "me3", meta.data = md ) me3 <- FindTopFeatures(me3) me3 <- RunTFIDF(me3) me3 <- RunSVD(me3, scale.embeddings = TRUE, reduction.name = 'lsi.me3') me3 <- RunUMAP(me3, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3') saveRDS(object = me3, file = "objects/henikoff/me3_filt.rds") ac <- CreateSeuratObject( counts = CreateChromatinAssay( counts = ac_counts, fragments = ac, annotation = annot ), assay = "ac", meta.data = md_ac ) ac <- FindTopFeatures(ac) ac <- RunTFIDF(ac) ac <- RunSVD(ac, scale.embeddings = TRUE, reduction.name = 'lsi.ac') ac <- RunUMAP(ac, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac') saveRDS(object = ac, file = "objects/henikoff/ac_filt.rds") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | library(Signac) library(ggplot2) bigWigs <- list( "H3K27me3 (multi)" = "data/pbmc_bulk/mapped/plex/me3.bw", "H3K27ac (multi)" = "data/pbmc_bulk/mapped/plex/ac.bw", "H3K27me3 (mono)" = "data/pbmc_bulk/mapped/mono/me3.bw", "H3K27ac (mono)" = "data/pbmc_bulk/mapped/mono/ac.bw" ) bw <- BigwigTrack( region = "chr1-156630171-156920171", bigwig = bigWigs, y_label = "Normalized coverage", ymax = "q90", bigwig.scale = "common", smooth = 1000 ) + theme(legend.position = "none") bw <- bw + scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) ggsave("plots/pbmc_bulk/bulk_covplot_pbmc.png", plot = bw, height = 5, width = 10, units = "in") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | library(ggplot2) library(GenomicRanges) library(Rsamtools) getFrip <- function(fragfile, regions) { results <- vector(mode = "numeric", length = length(regions)) # open tabix connection tabix.file <- TabixFile(file = fragfile) open(con = tabix.file) in_peak <- countTabix(tabix.file, param = regions) total_in_peak <- sum(unlist(in_peak)) # close connection close(con = tabix.file) total <- system( command = paste0("gzip -dc ", fragfile, " | wc -l"), wait = TRUE, intern = TRUE ) total <- as.numeric(total) frip <- total_in_peak / total return(frip) } # bulk fragments me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz" ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz" me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz" ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz" regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE) regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE) frip_me3_mono_me3_peak <- getFrip(fragfile = me3_mono, regions = regions_me3) frip_me3_plex_me3_peak <- getFrip(fragfile = me3_plex, regions = regions_me3) frip_ac_mono_me3_peak <- getFrip(fragfile = ac_mono, regions = regions_me3) frip_ac_plex_me3_peak <- getFrip(fragfile = ac_plex, regions = regions_me3) frip_me3_mono_ac_peak <- getFrip(fragfile = me3_mono, regions = regions_ac) frip_me3_plex_ac_peak <- getFrip(fragfile = me3_plex, regions = regions_ac) frip_ac_mono_ac_peak <- getFrip(fragfile = ac_mono, regions = regions_ac) frip_ac_plex_ac_peak <- getFrip(fragfile = ac_plex, regions = regions_ac) me3_peak <- data.frame( frip = c(frip_me3_mono_me3_peak, frip_me3_plex_me3_peak, frip_ac_mono_me3_peak, frip_ac_plex_me3_peak), assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex") ) me3_peak$assay <- factor(me3_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono")) p1 <- ggplot(me3_peak, aes(assay, frip, fill = assay)) + geom_bar(stat = "identity") + scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) + theme_bw() + theme(legend.position = "none") + ylab("FRiP") + ggtitle("Fraction of reads in H3K27me3 peaks") ac_peak <- data.frame( frip = c(frip_me3_mono_ac_peak, frip_me3_plex_ac_peak, frip_ac_mono_ac_peak, frip_ac_plex_ac_peak), assay = c("H3K27me3_mono", "H3K27me3_plex", "H3K27ac_mono", "H3K27ac_plex") ) ac_peak$assay <- factor(ac_peak$assay, levels = c("H3K27me3_plex", "H3K27ac_plex", "H3K27me3_mono", "H3K27ac_mono")) p2 <- ggplot(ac_peak, aes(assay, frip, fill = assay)) + geom_bar(stat = "identity") + scale_fill_manual(values = c("#D3145A", "#F98401", "#676767", "#676767")) + theme_bw() + theme(legend.position = "none") + ylab("FRiP") + ggtitle("Fraction of reads in H3K27ac peaks") ggsave(filename = "plots/pbmc_bulk/frip_me3.png", plot = p1, height = 4, width = 4) ggsave(filename = "plots/pbmc_bulk/frip_ac.png", plot = p2, height = 4, width = 4) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | library(ggplot2) library(patchwork) library(GenomicRanges) library(Rsamtools) me3_mono <- "data/pbmc_bulk/mapped/mono/me3.bed.gz" ac_mono <- "data/pbmc_bulk/mapped/mono/ac.bed.gz" me3_plex <- "data/pbmc_bulk/mapped/plex/me3.bed.gz" ac_plex <- "data/pbmc_bulk/mapped/plex/ac.bed.gz" regions_ac <- read.table("data/encode/ENCFF832RWT.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE) regions_ac$peak <- "H3K27ac" regions_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz", sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE) regions_me3$peak <- "H3K27me3" regions <- c(regions_me3, regions_ac) QuantifyRegions <- function(fragfile, regions) { results <- vector(mode = "numeric", length = length(regions)) # get total fragments for normalization # just read whole thing f <- read.table(fragfile, sep = "\t", header = FALSE) total <- nrow(f) rm(f) gc() # open tabix connection tabix.file <- TabixFile(file = fragfile) open(con = tabix.file) # get fragments in each region for (j in seq_along(regions)) { reads <- scanTabix(file = tabix.file, param = regions[j]) results[j] <- sum(sapply(X = reads, FUN = length)) } # close connection close(con = tabix.file) cmp <- (results / total) * 10^6 return(cmp) } get_cod_expression <- function(m) { eq <- substitute(~~italic(R)^2~"="~r2, list(r2 = format(summary(m)$r.squared, digits = 2))) cod <- as.expression(eq) return(cod) } ac_plex_cov <- QuantifyRegions(ac_plex, regions) ac_mono_cov <- QuantifyRegions(ac_mono, regions) me3_plex_cov <- QuantifyRegions(me3_plex, regions) me3_mono_cov <- QuantifyRegions(me3_mono, regions) mat <- cbind(ac_plex_cov, ac_mono_cov, me3_plex_cov, me3_mono_cov) mat <- as.data.frame(mat) mat$region <- regions$peak colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A") mat <- mat[sample(nrow(mat)), ] # p-m m.use <- mat[mat$region == "H3K27ac", ] ac_plex_mono <- ggplot( data = m.use, mapping = aes(ac_plex_cov, ac_mono_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 800)) + ylim(c(0, 800)) + xlab("H3K27ac (multiple antibody)") + ylab("H3K27ac (single antibody)") + ggtitle(get_cod_expression(lm(ac_plex_cov ~ ac_mono_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) m.use <- mat[mat$region == "H3K27me3", ] me3_plex_mono <- ggplot( data = m.use, mapping = aes(me3_plex_cov, me3_mono_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 100)) + ylim(c(0, 100)) + xlab("H3K27me3 (multiple antibody)") + ylab("H3K27me3 (single antibody)") + ggtitle(get_cod_expression(lm(me3_plex_cov ~ me3_mono_cov, data = m.use))) + geom_abline(intercept = 0, slope = 1, linetype = 2) ac_plex_me3_mono <- ggplot( data = mat, mapping = aes(ac_plex_cov, me3_mono_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (multiple antibody)") + ylab("H3K27me3 (single antibody)") + ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_mono_cov, data = mat))) + geom_abline(intercept = 0, slope = 1, linetype = 2) ac_mono_me3_plex <- ggplot( data = mat, mapping = aes(ac_mono_cov, me3_plex_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (single antibody)") + ylab("H3K27me3 (multiple antibody)") + ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_plex_cov, data = mat))) + geom_abline(intercept = 0, slope = 1, linetype = 2) # m-m ac_mono_me3_mono <- ggplot( data = mat, mapping = aes(ac_mono_cov, me3_mono_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (single antibody)") + ylab("H3K27me3 (single antibody)") + ggtitle(get_cod_expression(lm(ac_mono_cov ~ me3_mono_cov, data = mat))) + geom_abline(intercept = 0, slope = 1, linetype = 2) # p-p ac_plex_me3_plex <- ggplot( data = mat, mapping = aes(ac_plex_cov, me3_plex_cov, color = region)) + geom_point(size = 0.1) + scale_color_manual(values = colormap) + theme_classic() + xlim(c(0, 400)) + ylim(c(0, 400)) + xlab("H3K27ac (multiple antibody)") + ylab("H3K27me3 (multiple antibody)") + ggtitle(get_cod_expression(lm(ac_plex_cov ~ me3_plex_cov, data = mat))) + geom_abline(intercept = 0, slope = 1, linetype = 2) pp <- (ac_mono_me3_mono | ac_mono_me3_plex | ac_plex_me3_mono | ac_plex_me3_plex | ac_plex_mono | me3_plex_mono) & theme(legend.position = "none") ggsave(filename = "plots/pbmc/bulk_scatter.png", plot = pp, height = 3.5, width = 16, dpi = 600) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 | library(Signac) library(Seurat) library(ggplot2) library(patchwork) library(dplyr) args <- commandArgs(trailingOnly = TRUE) ntt <- readRDS(args[[1]]) ct_ac <- readRDS(args[[2]]) ct_me <- readRDS(args[[3]]) pbmc_sc <- readRDS(args[[4]]) colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A") mixed <- list("mp"= "#6b407a", "ma" = "#e64c2e", "ap" = "#7e784e", "all" = "#9a5752") # QC plots me3_frag <- Fragments(ntt[['me3']])[[1]]@path ac_frag <- Fragments(ntt[['ac']])[[1]]@path tot_me3 <- CountFragments( fragments = me3_frag, cells = colnames(ntt) ) tot_me3$mark <- "H3K27me3" tot_ac <- CountFragments( fragments = ac_frag, cells = colnames(ntt) ) tot_ac$mark <- "H3K27ac" all_counts <- rbind(tot_me3, tot_ac) mean(tot_me3$frequency_count) # 2854 mean(tot_ac$frequency_count) # 412 sd(tot_me3$frequency_count) # 2953 sd(tot_ac$frequency_count) # 356 # pbmc replicate me3_frag_2 <- Fragments(pbmc_sc[['me3']])[[1]]@path ac_frag_2 <- Fragments(pbmc_sc[['ac']])[[1]]@path tot_me3_2 <- CountFragments( fragments = me3_frag_2, cells = colnames(pbmc_sc) ) tot_me3_2$mark <- "H3K27me3" tot_ac_2 <- CountFragments( fragments = ac_frag_2, cells = colnames(pbmc_sc) ) tot_ac_2$mark <- "H3K27ac" all_counts_2 <- rbind(tot_me3_2, tot_ac_2) mean(tot_me3_2$frequency_count) # 670 mean(tot_ac_2$frequency_count) # 731 sd(tot_me3_2$frequency_count) # 1243 sd(tot_ac_2$frequency_count) # 1035 all_counts$dataset <- "scNTT-seq" all_counts_2$dataset <- "scNTT-seq" # ct henikoff me3_frag_ct <- Fragments(ct_me[['tiles']])[[1]]@path ac_frag_ct <- Fragments(ct_ac[["tiles"]])[[1]]@path tot_me3_ct <- CountFragments( fragments = me3_frag_ct, cells = colnames(ct_me) ) tot_me3_ct$mark <- "H3K27me3" tot_ac_ct <- CountFragments( fragments = ac_frag_ct, cells = colnames(ct_ac) ) tot_ac_ct$mark <- "H3K27ac" all_counts_ct <- rbind(tot_me3_ct, tot_ac_ct) mean(tot_me3_ct$frequency_count) # 519 mean(tot_ac_ct$frequency_count) # 283 sd(tot_me3_ct$frequency_count) # 383 sd(tot_ac_ct$frequency_count) # 272 all_counts_ct$dataset <- "scCUT&Tag" all_counts <- rbind(all_counts, all_counts_ct) p0 <- ggplot(all_counts, aes(mark, frequency_count, fill = dataset)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + ylab("Fragments") + xlab("Mark") + ggtitle("Total fragments per cell") p01 <- ggplot(all_counts, aes(mark, reads_count, fill = dataset)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + ylab("Reads") + xlab("Mark") + ggtitle("Total reads per cell") + theme(legend.position = "none") ggsave(filename = "plots/pbmc/fragments.png", plot = p01 | p0, height = 4, width = 7) # replicate 2 p0 <- ggplot(all_counts_2, aes(mark, frequency_count, fill = mark)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + scale_fill_manual(values = colormap[1:2]) + ylab("Fragments") + xlab("Mark") + ggtitle("Total fragments per cell") + theme(legend.position = "none") p01 <- ggplot(all_counts_2, aes(mark, reads_count, fill = mark)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + scale_fill_manual(values = colormap[1:2]) + ylab("Reads") + xlab("Mark") + ggtitle("Total reads per cell") + theme(legend.position = "none") ggsave(filename = "plots/pbmc/fragments_rep2.png", plot = p01 | p0, height = 4, width = 7) t1 <- TSSPlot(ntt, assay = "me3", group.by = "orig.ident") + ggtitle("H3K27me3") + scale_color_manual(values = colormap$H3K27me3) t2 <- TSSPlot(ntt, assay = "ac", group.by = "orig.ident") + ggtitle("H3K27ac") + scale_color_manual(values = colormap$H3K27ac) tt <- (t1 | t2) & ylim(c(0, 7)) ggsave(filename = "plots/pbmc/tss.png", plot = tt, height = 8, width = 5) p1 <- DimPlot(ntt, reduction = "umap.me3", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27me3") + NoLegend() p2 <- DimPlot(ntt, reduction = "umap.ac", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("H3K27ac") + NoLegend() p3 <- DimPlot(ntt, reduction = "umap.adt", label = TRUE, label.size = 3, repel = TRUE, pt.size = 0.1) + theme_void() + ggtitle("ADT") + NoLegend() p4 <- DimPlot(ntt, reduction = "umap.wnn", label = TRUE, label.size = 4) + ggtitle("PBMCs: H3K27me3 + H3K27ac + protein") + theme_classic() + NoLegend() + ylab("UMAP 1") + xlab("UMAP 2") pp <- ((p1 / p2 / p3) | p4) + plot_layout(widths = c(1, 2)) ggsave(filename = "plots/pbmc/dimplots.png", plot = pp, height = 5, width = 7) # wnn weight v1 <- VlnPlot(ntt, c("ac.weight", "me3.weight", "ADT.weight"), ncol = 1, pt.size = 0) ggsave(filename = "plots/pbmc/wnn_weight.png", plot = v1, height = 8, width = 12) # adt sensitivity df <- ntt[[]] df$dataset <- "scNTT-seq" df <- df[, c("nCount_ADT", "dataset")] mean(df$nCount_ADT) # 690 sd(df$nCount_ADT) # 613 df2 <- ct_me[[]] df2$dataset <- "CUT&Tag-pro (H3K27me3)" df2 <- df2[, c("nCount_ADT", "dataset")] df3 <- ct_ac[[]] df3$dataset <- "CUT&Tag-pro (H3K27ac)" df3 <- df3[, c("nCount_ADT", "dataset")] df <- rbind(df, df2, df3) v2 <- ggplot(data = df, aes(dataset, nCount_ADT, fill = dataset)) + geom_violin(size = 0.2) + scale_y_log10() + theme_bw() + theme(axis.title.x=element_blank(), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), legend.position = 'none') + ggtitle("ADT counts per cell") ggsave(filename = "plots/pbmc/ncount_adt.png", plot = v2, height = 4, width = 2.5) # featureplots adt DefaultAssay(ntt) <- "ADT" fp <- FeaturePlot( ntt, c("CD14", "CD19", "CD4", "CD8A", "IL2RB", "TFRC"), pt.size = 0.1, max.cutoff = 'q99', cols = c("lightgrey", "darkgreen"), ncol = 3, reduction = 'umap.wnn' ) & xlab("UMAP 1") & ylab("UMAP 2") & NoLegend() ggsave(filename = "plots/pbmc/featureplots.png", plot = fp, height = 6, width = 8) DefaultAssay(ntt) <- "me3" cp <- CoveragePlot( object = ntt, region = "PAX5", assay = list("me3", "ac"), split.assays = FALSE, assay.scale = "separate", features = c("CD19"), expression.assay = "ADT", idents = c("CD14+ Mono", "B cell"), extend.upstream = -30000, extend.downstream = 10000, window = 500, peaks = FALSE ) cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac)) ggsave(filename = "plots/pbmc/pax5_covplot.png", plot = cp, height = 3, width = 8) # colorscale cols <- colorRampPalette(c("lightgrey", "darkgreen"))(100) png("plots/pbmc/colorbar.png", width = 10, height = 4, units = "in", res = 400) image(matrix(1:100), col=cols) dev.off() cp <- CoveragePlot( object = ntt, region = "CD33", assay = list("me3", "ac"), split.assays = FALSE, assay.scale = "separate", features = c("CD33"), expression.assay = "ADT", idents = c("CD14+ Mono", "B cell"), extend.upstream = 2000, extend.downstream = 2000, window = 500, peaks = FALSE ) cp[[1]][[1]] <- cp[[1]][[1]] + scale_fill_manual(values = c(colormap$H3K27me3, colormap$H3K27ac)) ggsave(filename = "plots/pbmc/cd33_covplot.png", plot = cp, height = 3, width = 8) # run WNN on pair of chromatin assays, quantify how well structure matches protein neighbor graph ntt <- FindMultiModalNeighbors( object = ntt, reduction.list = list("lsi.me3", "lsi.ac"), dims.list = list(2:30, 2:30), knn.graph.name = "wknn.chrom", weighted.nn.name = "wnn.chrom", modality.weight.name = "wnn.chrom" ) # compute KNN purity using ADT-defined cell types, compare different graphs knn_purity_graph <- function(graph, clusters) { # pre-existing graph, can't change k nn_purity <- vector(mode = "numeric", length = length(x = clusters)) for (i in seq_len(length.out = nrow(x = graph))) { nbr <- names(which(graph[i, ] == 1)) nn_purity[i] <- sum(clusters[nbr] == clusters[i]) / sum(graph[i, ]) } return(nn_purity) } clusters <- ntt$celltype ntt <- FindNeighbors(ntt, reduction = "lsi.me3", dims = 2:30, k.param = 100) ntt <- FindNeighbors(ntt, reduction = "lsi.ac", dims = 2:30, k.param = 100) ntt <- FindNeighbors(ntt, reduction = "pca", dims = 1:30, k.param = 100) graph.me3 <- ntt[['me3_nn']] graph.ac <- ntt[['ac_nn']] graph.chrom <- ntt[['wknn.chrom']] graph.adt <- ntt[['ADT_nn']] graph.wnn <- ntt[['wknn']] kme3 <- data.frame( purity = knn_purity_graph(graph = graph.me3, clusters = clusters), assay = "H3K27me3", celltype = clusters ) kac <- data.frame( purity = knn_purity_graph(graph = graph.ac, clusters = clusters), assay = "H3K27ac", celltype = clusters ) kchrom <- data.frame( purity = knn_purity_graph(graph = graph.chrom, clusters = clusters), assay = "H3K27me3 + H3K27ac", celltype = clusters ) kadt <- data.frame( purity = knn_purity_graph(graph = graph.adt, clusters = clusters), assay = "ADT", celltype = clusters ) kwnn <- data.frame( purity = knn_purity_graph(graph = graph.wnn, clusters = clusters), assay = "ADT + H3K27me3 + H3K27ac", celltype = clusters ) kdf <- rbind(kme3, kac, kchrom, kadt) kdf$assay <- factor(kdf$assay, levels = c("H3K27ac", "H3K27me3", "H3K27me3 + H3K27ac", "ADT")) df <- kdf %>% group_by(assay) %>% mutate(total_025 = sum(purity < 0.25) / n()) %>% select(assay, total_025) %>% unique() knn_p <- ggplot(df, aes(assay, total_025, fill = assay)) + geom_bar(stat = "identity") + theme_classic() + theme(legend.position = "none") + xlab("Assay") + ylab("Fraction of cells with \n<25% neighbors same celltype") + scale_x_discrete(guide = guide_axis(angle = 45)) + scale_fill_manual( values = c(colormap$H3K27ac, colormap$H3K27me3, mixed$ma, "darkgreen") ) ggsave(filename = "plots/pbmc/knn_purity.pdf", plot = knn_p, height = 4, width = 3) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 | library(ggplot2) library(patchwork) mat_ac <- read.table("data/pbmc_protein/bigwig/raw_ac.tsv", sep = "\t", header = TRUE, comment.char = "") mat_ac <- mat_ac[, 4:ncol(mat_ac)] # remove region mat_ac <- as.matrix(mat_ac) mat_ac[is.nan(mat_ac)] <- 0 mat_ac <- as.data.frame(mat_ac) mat_me3 <- read.table("data/pbmc_protein/bigwig/raw_me3.tsv", sep = "\t", header = TRUE, comment.char = "") mat_me3 <- mat_me3[, 4:ncol(mat_me3)] # remove region mat_me3 <- as.matrix(mat_me3) mat_me3[is.nan(mat_me3)] <- 0 mat_me3 <- as.data.frame(mat_me3) encode <- list("ENCFF569IPX.bigWig" = "NK_H3K27me3_chip", "ENCFF293ETP.bigWig" = "NK_H3K27ac_chip", "ENCFF842JLZ.bigWig" = "CD8_memory_H3K27me3_chip", "ENCFF611GRL.bigWig" = "CD8_memory_H3K27ac_chip", "ENCFF181OXO.bigWig" = "CD4_H3K27ac_chip", "ENCFF811VAX.bigWig" = "CD4_H3K27me3_chip", "ENCFF526VJO.bigWig" = "CD8_H3K27ac_chip", "ENCFF499VWN.bigWig" = "CD8_H3K27me3_chip", "ENCFF777EGG.bigWig" = "CD14_mono_H3K27me3_chip", "ENCFF601NLG.bigWig" = "CD14_mono_H3K27ac_chip", "ENCFF951ZBV.bigWig" = "B_cell_H3K27ac_chip", "ENCFF046VLL.bigWig" = "B_cell_H3K27me3_chip", "ENCFF085YMB.bigWig" = "CMP_CD34_H3K27me3_chip", "ENCFF064JOI.bigWig" = "CMP_CD34_H3K27ac_chip") process_matrix <- function(mat, encode, method = "pearson") { encode_mat <- colnames(mat) %in% names(encode) colnames(mat)[encode_mat] <- unname(encode[colnames(mat)[encode_mat]]) colnames(mat) <- gsub(".bw", "", colnames(mat), fixed = TRUE) # work out me3 or ac if (any(grepl("me3", colnames(mat)))) { mat <- mat[, c("B_cell_me3", "B_cell_H3K27me3_chip", "NK_me3", "NK_H3K27me3_chip", "CD14_Mono_me3", "CD14_mono_H3K27me3_chip", "CD8_T_cell_me3", "CD8_H3K27me3_chip", "CD4_T_cell_me3", "CD4_H3K27me3_chip", "Late_erythroid_me3", "CMP_CD34_H3K27me3_chip")] } else { mat <- mat[, c("B_cell_ac", "B_cell_H3K27ac_chip", "NK_ac", "NK_H3K27ac_chip", "CD14_Mono_ac", "CD14_mono_H3K27ac_chip", "CD8_T_cell_ac", "CD8_H3K27ac_chip", "CD4_T_cell_ac", "CD4_H3K27ac_chip", "Late_erythroid_ac", "CMP_CD34_H3K27ac_chip")] } all_cor <- cor(mat, method = method) ac <- reshape2::melt(all_cor) ac <- ac[ac$Var1 %in% encode, ] ac <- ac[ac$Var2 %in% setdiff(colnames(mat), encode), ] ac$`Pearson correlation` <- ac$value return(ac) } me3 <- process_matrix(mat=mat_me3, encode=encode, method = "spearman") ac <- process_matrix(mat=mat_ac, encode=encode, method = "spearman") cells_keep <- c("CD14_Mono_me3", "Late_erythroid_me3", "B_cell_me3") encode_keep <- c("B_cell_H3K27me3_chip", "CD14_mono_H3K27me3_chip", "CMP_CD34_H3K27me3_chip") me3 <- me3[me3$Var2 %in% cells_keep, ] me3 <- me3[me3$Var1 %in% encode_keep, ] cells_keep <- c("CD14_Mono_ac", "Late_erythroid_ac", "B_cell_ac") encode_keep <- c("B_cell_H3K27ac_chip", "CD14_mono_H3K27ac_chip", "CMP_CD34_H3K27ac_chip") ac <- ac[ac$Var2 %in% cells_keep, ] ac <- ac[ac$Var1 %in% encode_keep, ] replacement_encode <- list("B_cell_H3K27ac_chip" = "B cell", "CD14_mono_H3K27ac_chip" = "CD14 Monocyte", "CMP_CD34_H3K27ac_chip" = "CD34+ CMP") replacement_sc <- list("B_cell_ac" = "B cell", "CD14_Mono_ac" = "CD14 Monocyte", "Late_erythroid_ac" = "Late_erythroid") ac$Var1 <- as.character(replacement_encode[as.character(ac$Var1)]) ac$Var2 <- as.character(replacement_sc[as.character(ac$Var2)]) replacement_encode <- list("B_cell_H3K27me3_chip" = "B cell", "CD14_mono_H3K27me3_chip" = "CD14 Monocyte", "CMP_CD34_H3K27me3_chip" = "CD34+ CMP") replacement_sc <- list("B_cell_me3" = "B cell", "CD14_Mono_me3" = "CD14 Monocyte", "Late_erythroid_me3" = "Late_erythroid") me3$Var1 <- as.character(replacement_encode[as.character(me3$Var1)]) me3$Var2 <- as.character(replacement_sc[as.character(me3$Var2)]) p1 <- ggplot(me3, aes(Var1, Var2, fill = `Pearson correlation`)) + geom_tile() + theme_classic() + scale_fill_distiller(palette = "RdYlBu") + scale_x_discrete(guide = guide_axis(angle = 45)) + xlab("Bulk-cell ChIP-seq") + ylab("Single-cell multiplexed NTT-seq") + ggtitle("H3K27me3") + theme(legend.position = "none") p2 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) + geom_tile() + theme_classic() + scale_fill_distiller(palette = "RdYlBu") + scale_x_discrete(guide = guide_axis(angle = 45)) + xlab("Bulk-cell ChIP-seq") + ylab("Single-cell multiplexed NTT-seq") + ggtitle("H3K27ac") pp <- (p1 / p2) ggsave(filename = "plots/pbmc/encode_cor_spearman.png", plot = pp, height = 6, width = 5) # correlation between replicates mat <- read.table("data/pbmc_protein/replicate_cor.tsv", sep = "\t", header = TRUE, comment.char = "") mat <- mat[, 4:ncol(mat)] # remove region mat <- as.matrix(mat) mat[is.nan(mat)] <- 0 mat <- as.data.frame(mat) colnames(mat) <- c( "Rep. 1 H3K27me3", "Rep. 1 H3K27ac", "Rep. 2 H3K27me3", "Rep. 2 H3K27ac" ) mat <- mat[, c("Rep. 1 H3K27me3", "Rep. 2 H3K27me3", "Rep. 1 H3K27ac", "Rep. 2 H3K27ac")] all_cor <- cor(mat, method = "pearson") ac <- reshape2::melt(all_cor) ac$`Pearson correlation` <- ac$value p1 <- ggplot(ac, aes(Var1, Var2, fill = `Pearson correlation`)) + geom_tile() + theme_classic() + scale_fill_distiller(palette = "RdBu") + scale_x_discrete(guide = guide_axis(angle = 45)) + xlab("") + ylab("") ggsave("plots/pbmc/replicate_correlation.png", plot = p1, height = 3, width = 5) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 | library(Signac) library(Seurat) library(GenomicRanges) library(ggplot2) colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401") pbmc <- readRDS("objects/pbmc_protein.rds") pbmc_sc <- readRDS("objects/pbmc_sc.rds") henikoff_me3_filt <- readRDS("objects/henikoff/me3_filt.rds") henikoff_ac_filt <- readRDS("objects/henikoff/ac_filt.rds") # ENCODE peaks pk_ac <- read.table("data/encode/ENCFF832RWT.bed.gz") pk_me3 <- read.table("data/encode/ENCFF291LVP.bed.gz") pk_ac <- makeGRangesFromDataFrame(df = pk_ac, seqnames.field = "V1", start.field = "V2", end.field = "V3") pk_me3 <- makeGRangesFromDataFrame(df = pk_me3, seqnames.field = "V1", start.field = "V2", end.field = "V3") # total fragments for each dataset get_total <- function(frags, cells) { tot <- CountFragments(fragments = frags, cells = cells) totals <- tot$frequency_count names(totals) <- tot$CB return(totals) } # pbmc totals_pbmc_ac <- get_total(Fragments(pbmc[['ac']])[[1]]@path, colnames(pbmc)) totals_pbmc_me3 <- get_total(Fragments(pbmc[['me3']])[[1]]@path, cells = colnames(pbmc)) # new replicate totals_pbmc_ac_n <- get_total(Fragments(pbmc_sc[['ac']])[[1]]@path, cells = colnames(pbmc_sc)) totals_pbmc_me3_n <- get_total(Fragments(pbmc_sc[['me3']])[[1]]@path, cells = colnames(pbmc_sc)) # henikoff r1_cells_filt <- henikoff_me3_filt$orig.ident == '1' r2_cells_filt <- henikoff_me3_filt$orig.ident == '2' totals_h_me3_1_f <- get_total(Fragments(henikoff_me3_filt)[[1]]@path, cells = henikoff_me3_filt$barcode[r1_cells_filt]) totals_h_me3_2_f <- get_total(Fragments(henikoff_me3_filt)[[2]]@path, cells = henikoff_me3_filt$barcode[r2_cells_filt]) names(totals_h_me3_1_f) <- paste0(names(totals_h_me3_1_f), ":1") names(totals_h_me3_2_f) <- paste0(names(totals_h_me3_2_f), ":2") totals_h_me3_f <- c(totals_h_me3_1_f, totals_h_me3_2_f) totals_h_ac_f <- get_total(Fragments(henikoff_ac_filt)[[1]]@path, cells = colnames(henikoff_ac_filt)) # total counts in peak regions get_in_peaks <- function(obj, regions) { counts <- FeatureMatrix( fragments = Fragments(obj), features = regions, cells = colnames(obj) ) return(colSums(counts)) } # ntt in_peak_pbmc_ac_ac <- get_in_peaks(obj = pbmc[['ac']], regions = pk_ac) in_peak_pbmc_ac_me <- get_in_peaks(obj = pbmc[['ac']], regions = pk_me3) in_peak_pbmc_me_me <- get_in_peaks(obj = pbmc[['me3']], regions = pk_me3) in_peak_pbmc_me_ac <- get_in_peaks(obj = pbmc[['me3']], regions = pk_ac) # rep2 in_peak_pbmc_ac_ac_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_ac) in_peak_pbmc_ac_me_n <- get_in_peaks(obj = pbmc_sc[['ac']], regions = pk_me3) in_peak_pbmc_me_me_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_me3) in_peak_pbmc_me_ac_n <- get_in_peaks(obj = pbmc_sc[['me3']], regions = pk_ac) # henikoff filt in_peak_h_ac_ac_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_ac) in_peak_h_ac_me_f <- get_in_peaks(obj = henikoff_ac_filt, regions = pk_me3) in_peak_h_me_me_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_me3) in_peak_h_me_ac_f <- get_in_peaks(obj = henikoff_me3_filt, regions = pk_ac) # frip make_df <- function(totals, inpeak, assay, pk, dataset) { d <- data.frame( cell = names(totals), total = totals, in_peak = inpeak[names(totals)], assay = assay, peak = pk, dataset = dataset ) d$FRIP <- d$in_peak / d$total return(d) } df <- rbind( make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq"), make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq"), make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq"), make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq"), make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag"), make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag"), make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag"), make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag") ) df2 <- rbind( make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq"), make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq"), make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq"), make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq"), ) df_filt <- df[df$assay == df$peak, ] df_filt <- df_filt[!is.na(df_filt$FRIP), ] p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) + geom_boxplot(outlier.size = 0.5) + xlab("Mark") + theme_bw() + ggtitle("FRiP") + theme(legend.position = 'none') ggsave("plots/pbmc/frip_pbmc.png", plot = p, height = 4, width = 2.5) p2 <- ggplot(df2, aes(x = assay, y = FRIP, fill = assay)) + geom_boxplot(outlier.size = 0.5) + scale_fill_manual(values = colormap) + xlab("Mark") + theme_bw() + ggtitle("FRiP") + theme(legend.position = 'none') ggsave("plots/pbmc/frip_pbmc_rep2.png", plot = p2, height = 4, width = 2.5) saveRDS(object = df_filt, file = "data/pbmc_protein/frip.rds") df1 <- df_filt[df_filt$dataset == "scNTT-seq", ] mean(df1[df1$peak == "H3K27ac", "FRIP"]) # 0.21 sd(df1[df1$peak == "H3K27ac", "FRIP"]) # 0.09 mean(df1[df1$peak == "H3K27me3", "FRIP"]) # 0.11 sd(df1[df1$peak == "H3K27me3", "FRIP"]) # 0.05 mean(df2[df2$peak == "H3K27ac", "FRIP"]) # 0.28 sd(df2[df2$peak == "H3K27ac", "FRIP"]) # 0.08 mean(df2[df2$peak == "H3K27me3", "FRIP"]) # 0.10 sd(df2[df2$peak == "H3K27me3", "FRIP"]) # 0.06 # revision df3 <- rbind( make_df(totals_pbmc_ac, in_peak_pbmc_ac_ac, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 1)"), make_df(totals_pbmc_ac, in_peak_pbmc_ac_me, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 1)"), make_df(totals_pbmc_me3, in_peak_pbmc_me_me, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 1)"), make_df(totals_pbmc_me3, in_peak_pbmc_me_ac, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 1)"), make_df(totals_h_ac_f, in_peak_h_ac_ac_f, "H3K27ac", "H3K27ac", "scCUT&Tag (PBMC)"), make_df(totals_h_ac_f, in_peak_h_ac_me_f, "H3K27ac", "H3K27me3", "scCUT&Tag (PBMC)"), make_df(totals_h_me3_f, in_peak_h_me_me_f, "H3K27me3", "H3K27me3", "scCUT&Tag (PBMC)"), make_df(totals_h_me3_f, in_peak_h_me_ac_f, "H3K27me3", "H3K27ac", "scCUT&Tag (PBMC)"), make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_ac_n, "H3K27ac", "H3K27ac", "scNTT-seq (PBMC 2)"), make_df(totals_pbmc_ac_n, in_peak_pbmc_ac_me_n, "H3K27ac", "H3K27me3", "scNTT-seq (PBMC 2)"), make_df(totals_pbmc_me3_n, in_peak_pbmc_me_me_n, "H3K27me3", 'H3K27me3', "scNTT-seq (PBMC 2)"), make_df(totals_pbmc_me3_n, in_peak_pbmc_me_ac_n, "H3K27me3", "H3K27ac", "scNTT-seq (PBMC 2)") ) bmmc_frip <- readRDS(file = "data/bmmc_dual/frip.rds") bmmc_frip$dataset <- "scNTT-seq (BMMC)" df3 <- rbind(bmmc_frip, df3) df_filt <- df3[df3$assay == df3$peak, ] df_filt <- df_filt[!is.na(df_filt$FRIP), ] p <- ggplot(df_filt, aes(x = assay, y = FRIP, fill = dataset)) + geom_boxplot(outlier.size = 0.5) + xlab("Mark") + theme_bw() + ggtitle("Fragments in peaks") ggsave("plots/pbmc/frip_pbmc_revision.png", plot = p, height = 4, width = 6) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | library(Seurat) library(Signac) library(BSgenome.Hsapiens.UCSC.hg38) library(EnsDb.Hsapiens.v86) library(tximport) library(future) args <- commandArgs(trailingOnly = TRUE) threads <- as.numeric(args[[1]]) adt <- args[[2]] ac_frag <- args[[3]] me3_frag <- args[[4]] outfile <- args[[5]] plan("multicore", workers = as.numeric(threads)) options(future.globals.maxSize = Inf) # process chromatin data annot <- GetGRangesFromEnsDb(EnsDb.Hsapiens.v86) seqlevelsStyle(annot) <- "UCSC" total_me3 <- CountFragments(fragments = me3_frag) total_ac <- CountFragments(fragments = ac_frag) total_ac <- total_ac[total_ac$frequency_count > 100, ] total_me3 <- total_me3[total_me3$frequency_count > 100, ] cells_keep <- intersect(total_ac$CB, total_me3$CB) frag_obj_me3 <- CreateFragmentObject(path = me3_frag, cells = cells_keep) frag_obj_ac <- CreateFragmentObject(path = ac_frag, cells = cells_keep) chr_use <- seqlengths(BSgenome.Hsapiens.UCSC.hg38)[1:22] counts_me3 <- AggregateTiles( object = frag_obj_me3, genome = chr_use, cells = cells_keep, min_counts = 1 ) counts_ac <- AggregateTiles( object = frag_obj_ac, genome = chr_use, cells = cells_keep, min_counts = 1 ) # remove features overlapping blacklist remove_blacklist <- function(counts) { olap <- findOverlaps(query = StringToGRanges(rownames(counts)), subject = blacklist_hg38_unified) is_bl <- queryHits(olap) counts <- counts[setdiff(1:nrow(counts), is_bl), ] return(counts) } assay_me3 <- CreateChromatinAssay( counts = remove_blacklist(counts_me3), fragments = me3_frag, annotation = annot ) assay_ac <- CreateChromatinAssay( counts = remove_blacklist(counts_ac), fragments = ac_frag, annotation = annot ) obj <- CreateSeuratObject(counts = assay_me3, assay = "me3") obj[['ac']] <- assay_ac # add adt data adt.txi <- tximport(files = adt, type = "alevin") cells <- intersect(colnames(obj), colnames(adt.txi$counts)) obj <- obj[, cells] obj[['ADT']] <- CreateAssayObject(counts = adt.txi$counts[, cells]) obj <- subset(obj, subset = nCount_me3 < 40000 & nCount_ac < 10000 & nCount_me3 > 300 & nCount_ac > 100 & nCount_ADT < 10000 & nCount_ADT > 120) DefaultAssay(obj) <- "ADT" obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2) igg_features <- rownames(obj)[grepl("^Ig", x = rownames(obj))] var_feat <- setdiff(rownames(obj), igg_features) VariableFeatures(obj) <- var_feat obj <- ScaleData(obj) obj <- RunPCA(obj) obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40) obj <- FindClusters(obj) obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40) # remove artifact clusters obj <- subset(obj, idents = c(4, 8), invert = TRUE) obj <- NormalizeData(obj, normalization.method = "CLR", margin = 2) obj <- ScaleData(obj) obj <- RunPCA(obj) obj <- FindNeighbors(obj, reduction = "pca", dims = 1:40) obj <- FindClusters(obj) obj <- RunUMAP(obj, reduction = 'pca', reduction.name = "umap.adt", dims = 1:40) DefaultAssay(obj) <- "me3" feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features obj[['me3']] <- subset(obj[['me3']], features = feat.keep) obj <- FindTopFeatures(obj) obj <- RunTFIDF(obj) obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.me3') obj <- RunUMAP(obj, reduction = 'lsi.me3', dims = 2:30, reduction.name = 'umap.me3') DefaultAssay(obj) <- "ac" feat.keep <- names(which(rowSums(obj, slot = 'counts') > 1)) # remove low-count features obj[['ac']] <- subset(obj[['ac']], features = feat.keep) obj <- FindTopFeatures(obj) obj <- RunTFIDF(obj) obj <- RunSVD(obj, scale.embeddings = TRUE, reduction.name = 'lsi.ac') obj <- RunUMAP(obj, reduction = 'lsi.ac', dims = 2:30, reduction.name = 'umap.ac') obj <- FindMultiModalNeighbors( object = obj, reduction.list = list("lsi.me3", "lsi.ac", "pca"), dims.list = list(2:30, 2:30, 1:30) ) obj <- RunUMAP(obj, nn.name = "weighted.nn", reduction.name = "umap.wnn") obj <- FindClusters(obj, graph.name = "wsnn", algorithm = 3, resolution = 1) Idents(obj) <- "seurat_clusters" obj <- RenameIdents(obj, list("0" = "CD14+ Mono", "1" = "CD14+ Mono", "2" = "B cell", "3" = "CD14+ Mono", "4" = "CD4 T cell", "5" = "CD16+ Mono", "6" = "Late erythroid", "7" = "CD8 T cell", "8" = "B cell", "9" = "NK", "10" = "cDC" ) ) obj$celltype <- Idents(obj) obj <- TSSEnrichment(obj, fast = FALSE, assay = "me3") obj <- TSSEnrichment(obj, fast = FALSE, assay = "ac") saveRDS(object = obj, file = outfile) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | library(Seurat) library(Signac) library(Rsamtools) library(BSgenome.Hsapiens.UCSC.hg38) library(ggplot2) library(patchwork) args <- commandArgs(trailingOnly = TRUE) ac_peaks <- args[[1]] me3_peaks <- args[[2]] colormap <- list("H3K27me3" = "#D3145A", "H3K27ac" = "#F98401", "RNAPII" = "#036C9A") regions_ac <- read.table(ac_peaks, sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE) regions_ac$peak <- "H3K27ac" regions_me3 <- read.table(me3_peaks, sep = "\t", col.names = c("chromosome", "start", "end", "name", "score", "strand", "a", "b" , "c", "d")) regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE) regions_me3$peak <- "H3K27me3" # regions_me3 <- read.table("data/pbmc_bulk/mapped/me3_peaks.broadPeak", sep = "\t", # col.names = c("chromosome", "start", "end", "name", "score", # "strand", "a", "b" , "c")) # regions_me3 <- makeGRangesFromDataFrame(regions_me3, keep.extra.columns = TRUE) # regions_me3$peak <- "H3K27me3" # regions_me3 <- keepStandardChromosomes(regions_me3, pruning.mode = "coarse") # regions_me3 <- dropSeqlevels(regions_me3, value = "chrM", pruning.mode = "coarse") # # regions_ac <- read.table("data/pbmc_bulk/mapped/ac_peaks.narrowPeak", sep = "\t", # col.names = c("chromosome", "start", "end", "name", "score", # "strand", "a", "b" , "c", "d")) # regions_ac <- makeGRangesFromDataFrame(regions_ac, keep.extra.columns = TRUE) # regions_ac$peak <- "H3K27ac" # regions_ac <- keepStandardChromosomes(regions_ac, pruning.mode = "coarse") # regions_ac <- dropSeqlevels(regions_ac, value = "chrM", pruning.mode = "coarse") regions <- c(regions_me3, regions_ac) QuantifyRegions <- function(fragfile, regions) { results <- vector(mode = "numeric", length = length(regions)) # open tabix connection tabix.file <- TabixFile(file = fragfile) open(con = tabix.file) # get fragments in each region for (j in seq_along(regions)) { reads <- scanTabix(file = tabix.file, param = regions[j]) results[j] <- sum(sapply(X = reads, FUN = length)) } # close connection close(con = tabix.file) cmp <- (results / sum(results)) * 10^6 return(cmp) } # get filtered fragment files # single-cell frags_me3 <- "data/pbmc_protein/bigwig/pbmc_me3.bed.gz" frags_ac <- "data/pbmc_protein/bigwig/pbmc_ac.bed.gz" # bulk_cell bulk_me3 <- "data/pbmc_bulk/mapped/mono/me3.bed.gz" bulk_ac <- "data/pbmc_bulk/mapped/mono/ac.bed.gz" # quantify ac_sc_cov <- QuantifyRegions(frags_ac, regions) me_sc_cov <- QuantifyRegions(frags_me3, regions) me_bulk_cov <- QuantifyRegions(bulk_me3, regions) ac_bulk_cov <- QuantifyRegions(bulk_ac, regions) # create matrix mat <- cbind(ac_sc_cov, me_sc_cov, ac_bulk_cov, me_bulk_cov) mat <- as.data.frame(mat) mat$region <- regions$peak # randomize order mat <- mat[sample(nrow(mat)), ] get_cod_expression <- function(m) { eq <- substitute(~~italic(R)^2~"="~r2, list(r2 = format(summary(m)$r.squared, digits = 2))) cod <- as.expression(eq) return(cod) } ac_me3_sc <- ggplot(mat, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) + geom_point(size = 0.1) + theme_classic() + ylim(c(0, 600)) + xlim(c(0, 600)) + xlab("") + ylab("") + ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat))) + scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) + geom_abline(intercept = 0, slope = 1, linetype = 2) + theme(legend.position = "none") ac_sc_ac_bulk <- ggplot(mat[mat$region=="H3K27ac", ], aes(x = ac_sc_cov, y = ac_bulk_cov, color = region)) + geom_point(size = 0.1) + theme_classic() + xlab("H3K27ac (single-cell)") + ylab("H3K27ac (bulk)") + ggtitle(get_cod_expression(lm(ac_sc_cov ~ ac_bulk_cov, data = mat))) + scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) + theme(legend.position = "none") ggsave(filename = "plots/pbmc/scatterplot_pbmc.png", plot = ac_me3_sc, height = 3.5, width = 3.5) ## BMMC frags_me3_bmmc <- "data/bmmc_dual/bulk_fragments/me3.bed.gz" frags_ac_bmmc <- "data/bmmc_dual/bulk_fragments/ac.bed.gz" # quantify me_sc_cov <- QuantifyRegions(frags_me3_bmmc, regions) ac_sc_cov <- QuantifyRegions(frags_ac_bmmc, regions) # create matrix mat_bmmc <- cbind(ac_sc_cov, me_sc_cov) mat_bmmc <- as.data.frame(mat_bmmc) mat_bmmc$region <- regions$peak mat_bmmc <- mat_bmmc[sample(nrow(mat_bmmc)), ] ac_me3_sc_bmmc <- ggplot(mat_bmmc, aes(x = ac_sc_cov, y = me_sc_cov, color = region)) + geom_point(size = 0.1) + theme_classic() + ylim(c(0, 600)) + xlim(c(0, 600)) + xlab("H3K27ac (multiplexed single-cell)") + scale_color_manual(values = c(colormap$H3K27ac, colormap$H3K27me3)) + ylab("H3K27me3 (multiplexed single-cell)") + ggtitle(get_cod_expression(lm(ac_sc_cov ~ me_sc_cov, data = mat_bmmc))) + theme(legend.position = "none") + geom_abline(intercept = 0, slope = 1, linetype = 2) ggsave(filename = "plots/bmmc/scatterplot_bmmc.png", plot = ac_me3_sc_bmmc, height = 4, width = 4) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | library(Signac) library(Seurat) obj <- readRDS("objects/pbmc_protein.rds") Idents(obj) <- "celltype" outdir <- "data/pbmc_protein/bigwig/" celltypes.split <- levels(obj) # split fragments for each assay DefaultAssay(obj) <- "ac" frags <- Fragments(obj)[[1]]@path for (i in celltypes.split) { ct_safe <- gsub(" ", "_", i, fixed = TRUE) ct_safe <- gsub("+", "", ct_safe, fixed = TRUE) message(ct_safe) FilterCells( fragments = frags, cells = WhichCells(object = obj, idents = i), outfile = paste0(outdir, ct_safe, "_ac.bed.gz"), verbose = TRUE ) } DefaultAssay(obj) <- "me3" frags <- Fragments(obj)[[1]]@path for (i in celltypes.split) { ct_safe <- gsub(" ", "_", i, fixed = TRUE) ct_safe <- gsub("+", "", ct_safe, fixed = TRUE) message(ct_safe) FilterCells( fragments = frags, cells = WhichCells(object = obj, idents = i), outfile = paste0(outdir, ct_safe, "_me3.bed.gz"), verbose = TRUE ) } # remove non-cell fragments for each assay for genome bin quant DefaultAssay(obj) <- "ac" frags <- Fragments(obj)[[1]]@path FilterCells( fragments = frags, cells = colnames(obj), outfile = paste0(outdir, "pbmc_ac.bed.gz"), verbose = TRUE ) DefaultAssay(obj) <- "me3" frags <- Fragments(obj)[[1]]@path FilterCells( fragments = frags, cells = colnames(obj), outfile = paste0(outdir, "pbmc_me3.bed.gz"), verbose = TRUE ) |
21 22 23 24 25 | shell: """ cd data wget https://hgdownload-test.gi.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes """ |
33 34 35 36 37 38 | shell: """ cd genome/hg38 aws s3 sync s3://stuart-genomes/hg38_analysis/bwa-mem2/ . aws s3 cp s3://stuart-genomes/hg38_analysis/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz . """ |
47 48 49 50 | shell: """ wget -i {input} -P data/ct_pro """ |
57 58 59 60 | shell: """ wget -i {input} -P data/mesc """ |
70 71 72 73 74 75 76 77 78 79 80 | shell: """ wget -i {input} -P data/henikoff wget https://github.com/Henikoff/scCUT-Tag/files/8647406/meta.csv -P data/henikoff wget https://github.com/Henikoff/scCUT-Tag/files/8653804/K27Ac_pbmc_meta.csv -P data/henikoff cd data/henikoff tabix -p bed GSM5034342_K27me3_R1_PBMC.fragments.HG38.tsv.gz tabix -p bed GSM5034343_K27me3_R2_PBMC.fragments.HG38.tsv.gz tabix -p bed GSM5034344_K27ac_PBMC.fragments.HG38.tsv.gz """ |
91 92 93 94 | shell: """ Rscript code/CT_pro/update_objects.R """ |
100 101 102 103 104 | shell: """ cd data/multict/fragments aws s3 sync s3://multi-cuttag/fragments/ . --request-payer """ |
109 110 111 112 113 114 115 | shell: """ cd data/K562_ATAC wget https://www.encodeproject.org/files/ENCFF006OFA/@@download/ENCFF006OFA.bigBed wget https://www.encodeproject.org/files/ENCFF600FDO/@@download/ENCFF600FDO.bigWig wget https://www.encodeproject.org/files/ENCFF558BLC/@@download/ENCFF558BLC.bed.gz """ |
122 123 124 125 126 | shell: """ wget -i {input} -P data/bmmc_atac aws s3 sync s3://mpal-hg38/public/ ./data/bmmc_atac/ --request-payer """ |
131 132 133 134 | shell: """ aws s3 sync s3://bmmc-reference/public objects/ --request-payer """ |
144 145 146 147 | shell: """ wget -i {input} -P data/encode """ |
157 158 159 160 | shell: """ Rscript code/encode/combine_peaks.R """ |
166 167 168 169 170 171 172 173 | shell: """ cd data/pbmc_atac wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_fragments.tsv.gz.tbi wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_singlecell.csv wget https://cf.10xgenomics.com/samples/cell-atac/2.1.0/10k_pbmc_ATACv2_nextgem_Chromium_X/10k_pbmc_ATACv2_nextgem_Chromium_X_filtered_peak_bc_matrix.h5 """ |
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 | shell: """ bwa-mem2 mem {input.genome} \ -t {threads} \ {input.r1} \ {input.r2} \ | samtools sort -@ {threads} -O bam - \ > data/HEK_K562_sc/mapped/{wildcards.mark}.bam samtools index data/HEK_K562_sc/mapped/{wildcards.mark}.bam sinto fragments \ -b data/HEK_K562_sc/mapped/{wildcards.mark}.bam \ -p {threads} \ -f data/HEK_K562_sc/sinto/{wildcards.mark}.tmp \ --barcode_regex "[^:]*" sort -k1,1 -k2,2n data/HEK_K562_sc/sinto/{wildcards.mark}.tmp > data/HEK_K562_sc/sinto/{wildcards.mark}.bed bgzip -@ {threads} data/HEK_K562_sc/sinto/{wildcards.mark}.bed tabix -p bed {output.frags} rm data/HEK_K562_sc/sinto/{wildcards.mark}.tmp """ |
217 218 219 220 221 222 223 224 225 226 | shell: """ bwa-mem2 mem {input.genome} \ -t {threads} \ {input.r1} \ {input.r2} \ | samtools sort -@ {threads} -O bam - \ > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam samtools index data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bam """ |
232 233 234 235 | shell: """ bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM """ |
241 242 243 244 245 246 247 248 249 250 251 252 253 | shell: """ sinto fragments -p {threads} \ -b {input} \ --barcode_regex "[^:]*" \ -f data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag sort -k1,1 -k2,2n data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag \ > data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed bgzip data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.bed tabix -p bed {output} rm data/HEK_K562_bulk/mapped/{wildcards.cell}-{wildcards.plex}-{wildcards.mark}.frag """ |
265 266 267 268 | shell: """ Rscript code/HEK_K562_sc/process.R {threads} {output.obj} data/HEK_K562_sc/sinto """ |
288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | shell: """ # me3 bwa-mem2 mem {input.genome} \ -t {threads} \ {input.me3_r1_mono} \ {input.me3_r2_mono} \ | samtools sort -@ {threads} -O bam - \ > data/pbmc_bulk/mapped/mono/me3.bam samtools index data/pbmc_bulk/mapped/mono/me3.bam bwa-mem2 mem {input.genome} \ -t {threads} \ {input.me3_r1_plex} \ {input.me3_r2_plex} \ | samtools sort -@ {threads} -O bam - \ > data/pbmc_bulk/mapped/plex/me3.bam samtools index data/pbmc_bulk/mapped/plex/me3.bam # ac bwa-mem2 mem {input.genome} \ -t {threads} \ {input.ac_r1_mono} \ {input.ac_r2_mono} \ | samtools sort -@ {threads} -O bam - \ > data/pbmc_bulk/mapped/mono/ac.bam samtools index data/pbmc_bulk/mapped/mono/ac.bam bwa-mem2 mem {input.genome} \ -t {threads} \ {input.ac_r1_plex} \ {input.ac_r2_plex} \ | samtools sort -@ {threads} -O bam - \ > data/pbmc_bulk/mapped/plex/ac.bam samtools index data/pbmc_bulk/mapped/plex/ac.bam """ |
329 330 331 332 | shell: """ bamCoverage -b {input} -o {output} -p {threads} --normalizeUsing BPM """ |
338 339 340 341 342 343 344 345 346 347 348 349 350 | shell: """ sinto fragments -p {threads} \ -b {input} \ --barcode_regex "[^:]*" \ -f data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag sort -k1,1 -k2,2n data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag \ > data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed bgzip data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.bed tabix -p bed {output} rm data/pbmc_bulk/mapped/{wildcards.plex}/{wildcards.mark}.frag """ |
363 364 365 366 367 368 369 370 371 372 | shell: """ bedtools genomecov -i {input.me3} -g {input.chrom} -bg > data/ct_pro/H3K27me3.tmp sort -k1,1 -k2,2n data/ct_pro/H3K27me3.tmp > {output.me3_bg} bedGraphToBigWig {output.me3_bg} {input.chrom} {output.me3_bw} bedtools genomecov -i {input.ac} -g {input.chrom} -bg > data/ct_pro/H3K27ac.tmp sort -k1,1 -k2,2n data/ct_pro/H3K27ac.tmp > {output.ac_bg} bedGraphToBigWig {output.ac_bg} {input.chrom} {output.ac_bw} """ |
382 383 384 385 | shell: """ wget -i {input} -P data/k562_peaks """ |
393 394 395 396 | shell: """ Rscript code/HEK_K562_bulk/collect_peaks.R """ |
411 412 413 414 415 416 417 418 419 | shell: """ python code/demux.py \ --read1 {input.read1} \ --read_i5 {input.readi5} \ --read2 {input.read2} \ --tn5 {input.bc} \ --output {params.outdir} """ |
428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | shell: """ bwa-mem2 mem {input.genome} \ -t {threads} \ data/{wildcards.exp}/ntt/out/{wildcards.mk}.R1.fastq \ data/{wildcards.exp}/ntt/out/{wildcards.mk}.R2.fastq \ | samtools sort -@ {threads} -O bam - \ > data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam samtools index data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam sinto fragments -p {threads} \ -b data/{wildcards.exp}/ntt/out/{wildcards.mk}.bam \ --barcode_regex "[^:]*" \ -f data/{wildcards.exp}/ntt/{wildcards.mk}.frag sort -k1,1 -k2,2n data/{wildcards.exp}/ntt/{wildcards.mk}.frag > data/{wildcards.exp}/ntt/{wildcards.mk}.tsv rm data/{wildcards.exp}/ntt/{wildcards.mk}.frag bgzip -@ {threads} data/{wildcards.exp}/ntt/{wildcards.mk}.tsv tabix -p bed {output} """ |
456 457 458 459 | shell: """ salmon index -t {input} -i {output} --features -k7 """ |
468 469 470 471 472 473 474 475 | shell: """ paste <(gunzip -c {input.r1}) <(gunzip -c {input.r3}) \ | paste - - - - \ | awk -F'\\t' -v one="data/pbmc_protein/adt/read1.fastq" '{{ OFS="\\n"; print $1,$3$4,$5,$7$8 >> one;}}' pigz -p {threads} data/pbmc_protein/adt/read1.fastq """ |
484 485 486 487 488 489 490 491 492 493 494 495 496 497 | shell: """ salmon alevin -l ISR -i {input.index} \ -o data/pbmc_protein/adt/outs \ -p {threads} \ --naiveEqclass \ --keepCBFraction 0.8 \ -1 data/pbmc_protein/adt/TAG_S2_R2_001.fastq.gz \ -2 {input.reads} \ --bc-geometry 1[1-16] \ --umi-geometry 2[1-10] \ --read-geometry 2[71-85] \ --tgMap data/adt_map_totalseq_a.tsv """ |
504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 | shell: """ cd data/bmmc_dual/DNA cutadapt -j {threads} -u 14 -o K27ac_R2.fastq scBM_k27ac_S2_R2_001.fastq.gz cutadapt -j {threads} -u 14 -o K27me_R2.fastq scBM_k27me_S1_R2_001.fastq.gz gzip -d scBM_k27ac_S2_R1_001.fastq.gz scBM_k27ac_S2_R3_001.fastq.gz gzip -d scBM_k27me_S1_R1_001.fastq.gz scBM_k27me_S1_R3_001.fastq.gz sinto barcode \ --barcode_fastq K27ac_R2.fastq \ --read1 scBM_k27ac_S2_R1_001.fastq \ --read2 scBM_k27ac_S2_R3_001.fastq \ -b 16 sinto barcode \ --barcode_fastq K27me_R2.fastq \ --read1 scBM_k27me_S1_R1_001.fastq \ --read2 scBM_k27me_S1_R3_001.fastq \ -b 16 pigz -p {threads} *.fastq mkdir barcoded mv scBM_k27me_S1_R1_001.barcoded.fastq.gz scBM_k27me_S1_R3_001.barcoded.fastq.gz ./barcoded/ mv scBM_k27ac_S2_R1_001.barcoded.fastq.gz scBM_k27ac_S2_R3_001.barcoded.fastq.gz ./barcoded/ """ |
540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 | shell: """ bwa-mem2 mem {input.genome} \ -t {threads} \ data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R1_001.barcoded.fastq.gz \ data/bmmc_dual/DNA/barcoded/scBM_k27me_S1_R3_001.barcoded.fastq.gz \ | samtools sort -@ {threads} -O bam - \ > data/bmmc_dual/DNA/H3K27me3.bam bwa-mem2 mem {input.genome} \ -t {threads} \ data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R1_001.barcoded.fastq.gz \ data/bmmc_dual/DNA/barcoded/scBM_k27ac_S2_R3_001.barcoded.fastq.gz \ | samtools sort -@ {threads} -O bam - \ > data/bmmc_dual/DNA/H3K27ac.bam samtools index data/bmmc_dual/DNA/H3K27ac.bam samtools index data/bmmc_dual/DNA/H3K27me3.bam sinto fragments -p {threads} \ -b data/bmmc_dual/DNA/H3K27ac.bam \ --barcode_regex "[^:]*" \ -f data/bmmc_dual/DNA/H3K27ac.frag sinto fragments -p {threads} \ -b data/bmmc_dual/DNA/H3K27me3.bam \ --barcode_regex "[^:]*" \ -f data/bmmc_dual/DNA/H3K27me3.frag sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27ac.frag > data/bmmc_dual/DNA/H3K27ac.bed sort -k1,1 -k2,2n data/bmmc_dual/DNA/H3K27me3.frag > data/bmmc_dual/DNA/H3K27me3.bed rm data/bmmc_dual/DNA/H3K27ac.frag data/bmmc_dual/DNA/H3K27me3.frag bgzip -@ {threads} data/bmmc_dual/DNA/H3K27ac.bed bgzip -@ {threads} data/bmmc_dual/DNA/H3K27me3.bed tabix -p bed {output.ac} tabix -p bed {output.me} """ |
584 585 586 587 | shell: """ Rscript code/bmmc_atac/process_bmmc_atac.r """ |
598 599 600 601 | shell: """ Rscript code/henikoff/process.R """ |
616 617 618 619 | shell: """ Rscript code/HEK_K562_bulk/analysis.R """ |
637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 | shell: """ gzip -dc {input.me3_peaks} > data/k562_peaks/ENCFF031FSF.bed gzip -dc {input.ac_peaks} > data/k562_peaks/ENCFF038DDS.bed computeMatrix reference-point \ --referencePoint 'center' \ --missingDataAsZero \ -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \ {input.mono_ac} {input.mono_me3} {input.mono_rna} \ -R data/k562_peaks/ENCFF049HUP.bed \ --upstream 5000 --downstream 5000 \ -p {threads} \ -o data/HEK_K562_bulk/h3k27me3.mat.gz computeMatrix reference-point \ --referencePoint 'center' \ --missingDataAsZero \ -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \ {input.mono_ac} {input.mono_me3} {input.mono_rna} \ -R data/k562_peaks/ENCFF038DDS.bed \ --upstream 5000 --downstream 5000 \ -p {threads} \ -o data/HEK_K562_bulk/h3k27ac.mat.gz computeMatrix reference-point \ --referencePoint 'center' \ --missingDataAsZero \ -S {input.plex_ac} {input.plex_me3} {input.plex_rna} \ {input.mono_ac} {input.mono_me3} {input.mono_rna} \ -R {input.rna_peaks} \ --upstream 5000 --downstream 5000 \ -p {threads} \ -o data/HEK_K562_bulk/rna.mat.gz plotHeatmap \ -m data/HEK_K562_bulk/h3k27me3.mat.gz \ -o {output.me3} \ --whatToShow 'heatmap and colorbar' \ --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767' plotHeatmap \ -m data/HEK_K562_bulk/h3k27ac.mat.gz \ -o {output.ac} \ --zMax 0.5 \ --whatToShow 'heatmap and colorbar' \ --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767' plotHeatmap \ -m data/HEK_K562_bulk/rna.mat.gz \ -o {output.rna} \ --zMax 0.5 \ --whatToShow 'heatmap and colorbar' \ --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#036C9A' '#FFFFFF,#676767' '#FFFFFF,#676767' '#FFFFFF,#676767' """ |
697 698 699 700 | shell: """ Rscript code/HEK_K562_sc/analysis.R """ |
706 707 708 709 | shell: """ Rscript code/HEK/K562_bulk/quantify_regions.R """ |
719 720 721 722 | shell: """ Rscript code/HEK_K562_sc/mct_scatter.R """ |
730 731 732 733 | shell: """ Rscript code/pbmc_atac/process.R """ |
742 743 744 745 | shell: """ Rscript code/pbmc_protein/process.R {threads} {input.adt} {input.ac} {input.me} {output} """ |
753 754 755 756 | shell: """ Rscript code/pbmc_sc/process.R {threads} {input.ac} {input.me} {output} """ |
770 771 772 773 774 775 776 777 778 779 780 781 782 | shell: """ # split fragment files Rscript code/pbmc_protein/split.R # create bigwig cd data/pbmc_protein/bigwig for fragfile in $(ls -d *.bed.gz);do fname=(${{fragfile//.bed.gz/ }}) bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg" bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw" done """ |
795 796 797 798 799 800 801 802 803 804 805 806 807 | shell: """ # split fragment files Rscript code/pbmc_sc/split.R # create bigwig cd data/pbmc_sc/ntt for fragfile in $(ls -d *.bed.gz);do fname=(${{fragfile//.bed.gz/ }}) bedtools genomecov -i $fragfile -g ../../../{input.chrom} -bg > "${{fname}}.bg" bedGraphToBigWig "${{fname}}.bg" ../../../{input.chrom} "${{fname}}.bw" done """ |
814 815 816 817 | shell: """ Rscript code/bmmc_dual/split.R """ |
834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 | shell: """ multiBigwigSummary BED-file -b \ {input.me3_sc} \ {input.ac_sc} \ {input.me3_bulk_mono} \ {input.me3_bulk_plex} \ {input.ac_bulk_mono} \ {input.ac_bulk_plex} \ {input.me3_chip} \ {input.ac_chip} \ {input.ctp_ac_bw} \ {input.ctp_me3_bw} \ --BED {input.all_peaks} \ -p {threads} \ --outRawCounts {output} \ -o data/pbmc_bulk/cor_matrix_all.npz """ |
862 863 864 865 866 867 868 869 870 871 872 873 | shell: """ multiBigwigSummary BED-file -b \ {input.me3_1} \ {input.ac_1} \ {input.me3_2} \ {input.ac_2} \ --BED {input.all_peaks} \ -p {threads} \ --outRawCounts {output} \ -o data/pbmc_protein/replicate_cor_matrix_all.npz """ |
879 880 881 882 | shell: """ Rscript code/pbmc_bulk/encode_cor_all.R """ |
896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 | shell: """ # ac cor multiBigwigSummary BED-file -b \ data/pbmc_protein/bigwig/B_cell_ac.bw \ data/pbmc_protein/bigwig/NK_ac.bw \ data/pbmc_protein/bigwig/CD14_Mono_ac.bw \ data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \ data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \ data/pbmc_protein/bigwig/Late_erythroid_ac.bw \ data/encode/ENCFF293ETP.bigWig \ data/encode/ENCFF611GRL.bigWig \ data/encode/ENCFF181OXO.bigWig \ data/encode/ENCFF526VJO.bigWig \ data/encode/ENCFF601NLG.bigWig \ data/encode/ENCFF951ZBV.bigWig \ data/encode/ENCFF064JOI.bigWig \ --BED {input.ac} \ -p {threads} \ --outRawCounts data/pbmc_protein/bigwig/raw_ac.tsv \ -o data/pbmc_protein/bigwig/cor_matrix_ac.npz # me3 cor multiBigwigSummary BED-file -b \ data/pbmc_protein/bigwig/B_cell_me3.bw \ data/pbmc_protein/bigwig/NK_me3.bw \ data/pbmc_protein/bigwig/CD14_Mono_me3.bw \ data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \ data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \ data/pbmc_protein/bigwig/Late_erythroid_me3.bw \ data/encode/ENCFF569IPX.bigWig \ data/encode/ENCFF842JLZ.bigWig \ data/encode/ENCFF811VAX.bigWig \ data/encode/ENCFF499VWN.bigWig \ data/encode/ENCFF777EGG.bigWig \ data/encode/ENCFF046VLL.bigWig \ data/encode/ENCFF085YMB.bigWig \ --BED {input.me3} \ -p {threads} \ --outRawCounts data/pbmc_protein/bigwig/raw_me3.tsv \ -o data/pbmc_protein/bigwig/cor_matrix_me3.npz # all cor multiBigwigSummary BED-file -b \ data/pbmc_protein/bigwig/B_cell_me3.bw \ data/pbmc_protein/bigwig/NK_me3.bw \ data/pbmc_protein/bigwig/CD14_Mono_me3.bw \ data/pbmc_protein/bigwig/CD8_T_cell_me3.bw \ data/pbmc_protein/bigwig/CD4_T_cell_me3.bw \ data/pbmc_protein/bigwig/Late_erythroid_me3.bw \ data/pbmc_protein/bigwig/B_cell_ac.bw \ data/pbmc_protein/bigwig/NK_ac.bw \ data/pbmc_protein/bigwig/CD14_Mono_ac.bw \ data/pbmc_protein/bigwig/CD8_T_cell_ac.bw \ data/pbmc_protein/bigwig/CD4_T_cell_ac.bw \ data/pbmc_protein/bigwig/Late_erythroid_ac.bw \ data/encode/ENCFF569IPX.bigWig \ data/encode/ENCFF842JLZ.bigWig \ data/encode/ENCFF811VAX.bigWig \ data/encode/ENCFF499VWN.bigWig \ data/encode/ENCFF777EGG.bigWig \ data/encode/ENCFF046VLL.bigWig \ data/encode/ENCFF085YMB.bigWig \ data/encode/ENCFF293ETP.bigWig \ data/encode/ENCFF611GRL.bigWig \ data/encode/ENCFF181OXO.bigWig \ data/encode/ENCFF526VJO.bigWig \ data/encode/ENCFF601NLG.bigWig \ data/encode/ENCFF951ZBV.bigWig \ data/encode/ENCFF064JOI.bigWig \ --BED data/encode/all.bed \ -p {threads} \ --outRawCounts data/pbmc_protein/bigwig/raw.tsv \ -o data/pbmc_protein/bigwig/cor_matrix_all.npz """ |
982 983 984 985 | shell: """ Rscript code/pbmc_protein/encode_cor.R """ |
994 995 996 997 | shell: """ Rscript code/pbmc_bulk/analysis.R """ |
1008 1009 1010 1011 | shell: """ Rscript code/pbmc_bulk/frip.R """ |
1024 1025 1026 1027 | shell: """ Rscript code/pbmc_protein/frip.R """ |
1035 1036 1037 1038 | shell: """ Rscript code/bmmc_dual/frip_bmmc.R """ |
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 | shell: """ gzip -dc {input.me3_peaks} > data/encode/ENCFF291LVP.bed gzip -dc {input.ac_peaks} > data/encode/ENCFF832RWT.bed computeMatrix reference-point \ --referencePoint 'center' \ --missingDataAsZero \ -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \ -R data/encode/ENCFF291LVP.bed \ --upstream 5000 --downstream 5000 \ -p {threads} \ -o data/pbmc_bulk/h3k27me3.mat.gz computeMatrix reference-point \ --referencePoint 'center' \ --missingDataAsZero \ -S {input.plex_ac} {input.plex_me3} {input.mono_ac} {input.mono_me3} \ -R data/encode/ENCFF832RWT.bed \ --upstream 5000 --downstream 5000 \ -p {threads} \ -o data/pbmc_bulk/h3k27ac.mat.gz plotHeatmap \ -m data/pbmc_bulk/h3k27me3.mat.gz \ -o {output.me3} \ --whatToShow 'heatmap only' \ --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767' plotHeatmap \ -m data/pbmc_bulk/h3k27ac.mat.gz \ -o {output.ac} \ --whatToShow 'heatmap only' \ --colorList '#FFFFFF,#F98401' '#FFFFFF,#D3145A' '#FFFFFF,#676767' '#FFFFFF,#676767' """ |
1100 1101 1102 1103 | shell: """ Rscript code/pbmc_protein/scatterplots.R {input.ac_peaks} {input.me3_peaks} """ |
1115 1116 1117 1118 | shell: """ Rscript code/pbmc_sc/scatterplots_sc.R {input.ac_peaks} {input.me3_peaks} """ |
1126 1127 1128 1129 | shell: """ Rscript code/bmmc_dual/process.R {threads} {output} """ |
1139 1140 1141 1142 | shell: """ Rscript code/bmmc_dual/trajectory.R """ |
1147 1148 1149 1150 | shell: """ Rscript code/bmmc_dual/transfer.R """ |
1155 1156 1157 1158 | shell: """ Rscript code/bmmc_dual/analysis.R """ |
1169 1170 1171 1172 | shell: """ Rscript code/pbmc_protein/analysis.R {input.ntt} {input.ct_ac} {input.ct_me3} {input.ntt_2} """ |
1184 1185 1186 1187 | shell: """ Rscript code/pbmc_bulk/quantify_regions.R """ |
Support
- Future updates