BioWorkflows: Analysis code for the TAP-seq manuscript.

shell:
  "picard FastqToSam "
  "FASTQ={input.fastq1} "
  "FASTQ2={input.fastq2} "
  "OUTPUT={output} "
  "SAMPLE_NAME={wildcards.sample} "
  "SORT_ORDER=queryname "
  "2> {log}"

SnakeMake Picard From line 16 of rules/align_reads.smk

shell:
  "TagBamWithReadSequenceExtended "
  "INPUT={input} "
  "OUTPUT={output.bam} "
  "SUMMARY={output.summary} "
  "BASE_QUALITY={params.base_qual} "
  "NUM_BASES_BELOW_QUALITY={params.bases_below_qual} "
  "BASE_RANGE={params.base_range} "
  "BARCODED_READ=1 "
  "DISCARD_READ=false "
  "TAG_NAME=XC "
  "2> {log}"

SnakeMake From line 38 of rules/align_reads.smk

shell:
  "TagBamWithReadSequenceExtended "
  "INPUT={input} "
  "OUTPUT={output.bam} "
  "SUMMARY={output.summary} "
  "BASE_QUALITY={params.base_qual} "
  "NUM_BASES_BELOW_QUALITY={params.bases_below_qual} "
  "BASE_RANGE={params.base_range} "
  "BARCODED_READ=1 "
  "DISCARD_READ=true "
  "TAG_NAME=XM "
  "2> {log}"

SnakeMake From line 64 of rules/align_reads.smk

shell:
  "FilterBAM "
  "INPUT={input} "
  "OUTPUT={output} "
  "TAG_REJECT=XQ "
  "2> {log}"

SnakeMake From line 85 of rules/align_reads.smk

shell:
  "TrimStartingSequence "
  "INPUT={input} "
  "OUTPUT={output.bam} "
  "OUTPUT_SUMMARY={output.summary} "
  "SEQUENCE={params.adapter_sequence} "
  "MISMATCHES={params.mismatches} "
  "NUM_BASES={params.num_bases} "
  "2> {log}"

SnakeMake From line 105 of rules/align_reads.smk

shell:
  "PolyATrimmer "
  "INPUT={input} "
  "OUTPUT={output.bam} "
  "OUTPUT_SUMMARY={output.summary} "
  "MISMATCHES={params.mismatches} "
  "NUM_BASES={params.num_bases} "
  "2> {log}"

SnakeMake From line 127 of rules/align_reads.smk

shell:
  "picard SamToFastq "
  "INPUT={input} "
  "FASTQ={output} "
  "2> {log}"

SnakeMake Picard From line 144 of rules/align_reads.smk

shell:
  "STAR --runThreadN {threads} "
  "--genomeDir {input.genomedir} "
  "--readFilesIn {input.fastq} "
  "--outFileNamePrefix {params.outprefix} "
  "--readFilesCommand zcat "
  "--outSAMtype BAM Unsorted ; "

SnakeMake STAR From line 162 of rules/align_reads.smk

shell:
  "picard SortSam "
  "INPUT={input} "
  "OUTPUT={output} "
  "SORT_ORDER=queryname "
  "2> {log}"

SnakeMake Picard From line 183 of rules/align_reads.smk

shell:
  "picard MergeBamAlignment "
  "ALIGNED_BAM={input.aligned} "
  "UNMAPPED_BAM={input.unaligned} "
  "REFERENCE_SEQUENCE={input.reference} "
  "OUTPUT={output} "
  "INCLUDE_SECONDARY_ALIGNMENTS=false "
  "PAIRED_RUN=false "
  "2> {log}"

SnakeMake Picard From line 202 of rules/align_reads.smk

shell:
  "TagReadWithGeneExon "
  "INPUT={input.bam} "
  "OUTPUT={output.bam} "
  "ANNOTATIONS_FILE={input.annot} "
  "TAG=GE "
  "CREATE_INDEX=true "
  "2> {log}"

SnakeMake From line 222 of rules/align_reads.smk

shell:
  "BAMTagHistogram "
  "INPUT={input} "
  "OUTPUT={output} "
  "TAG=XC "
  "READ_QUALITY={params.read_quality} "
  "2> {log}"

SnakeMake From line 241 of rules/align_reads.smk

script:
  "../scripts/processing/align_report.Rmd"

SnakeMake From line 264 of rules/align_reads.smk

shell:
  "python scripts/processing/extract_dge.py -i {input.umi_obs} {params.whitelist_arg}"
  "-o {output.dge} --sample {params.reads} --seed {params.seed} "
  "--tpt_threshold {params.tpt_threshold} 2> {log}"

SnakeMake From line 46 of rules/analyses.smk

script:
  "../scripts/analyses/advanced_dge_downsampling.R"

SnakeMake From line 68 of rules/analyses.smk

script:
  "../scripts/analyses/reads_on_target.R"

SnakeMake From line 82 of rules/analyses.smk

script:
  "../scripts/analyses/tapseq_vs_cropseq.Rmd"

SnakeMake From line 103 of rules/analyses.smk

script:
  "../scripts/analyses/filter_mmix_umi_obs.R"

SnakeMake From line 114 of rules/analyses.smk

shell:
  "python scripts/processing/extract_dge.py -i {input.umi_obs} -o {output.dge} "
  "{params.whitelist_arg} --sample {params.reads} --seed {params.seed} --tpt_threshold "
  "{params.tpt_threshold} 2> {log}"

SnakeMake From line 134 of rules/analyses.smk

script:
  "../scripts/analyses/screen_data_qc.Rmd"

SnakeMake From line 158 of rules/analyses.smk

shell:
  "python scripts/analyses/collapse_perturbations.py -i {input.pert_status} -t {input.grna_targets} "
  "-o {output} 2> {log}"

SnakeMake From line 170 of rules/analyses.smk

script:
  "../scripts/analyses/differential_expression.R"

SnakeMake From line 195 of rules/analyses.smk

script:
  "../scripts/analyses/compare_covariates.Rmd"

SnakeMake From line 212 of rules/analyses.smk

script:
  "../scripts/analyses/process_de_results.R"

SnakeMake From line 231 of rules/analyses.smk

script:
  "../scripts/analyses/map_enhancers.Rmd"

SnakeMake From line 251 of rules/analyses.smk

shell:
  "wget -O {output} {params.url}"

SnakeMake From line 261 of rules/analyses.smk

shell:
  "wget -O {output.bam} {params.url} ; picard BuildBamIndex I={output.bam}"

SnakeMake Picard From line 272 of rules/analyses.smk

shell:
  "rm -r data/k562_chromatin_data/HiC/5kb_resolution_intrachromosomal; "
  "wget -c {params.url} -O - | tar -xz K562/5kb_resolution_intrachromosomal/chr*/MAPQG0/; "
  "mv K562/5kb_resolution_intrachromosomal "
  "data/k562_chromatin_data/HiC/5kb_resolution_intrachromosomal; "
  "rmdir K562"

SnakeMake From line 284 of rules/analyses.smk

script:
  "../scripts/analyses/chromatin_analyses_screen.Rmd"

SnakeMake From line 300 of rules/analyses.smk

script:
  "../scripts/analyses/hic_analysis.Rmd"

SnakeMake From line 314 of rules/analyses.smk

script:
  "../scripts/analyses/abc_analysis_screen.Rmd"

SnakeMake From line 324 of rules/analyses.smk

shell:
  "Rscript scripts/analyses/extract_cropseq_vectors.R -b {input.bam} -d {input.dge_summary} "
  "-p {params.vector_pattern} -o {output}"

SnakeMake From line 337 of rules/analyses.smk

script:
  "../scripts/analyses/screen_grna_synthesis_errors.Rmd"

SnakeMake From line 349 of rules/analyses.smk

script:
  "../scripts/chromatin_annotated_etps/enhancer_screen_etps.R"

SnakeMake From line 18 of rules/chromatin_annotated_etps.smk

script:
  "../scripts/chromatin_annotated_etps/genome_wide_etps.R"

SnakeMake From line 35 of rules/chromatin_annotated_etps.smk

script:
  "../scripts/chromatin_annotated_etps/gasperini_screen_etps.R"

SnakeMake From line 52 of rules/chromatin_annotated_etps.smk

script:
  "../scripts/chromatin_annotated_etps/gasperini_pilot_etps.R"

SnakeMake From line 69 of rules/chromatin_annotated_etps.smk

script:
  "../scripts/chromatin_annotated_etps/fulco_tiling_etps.R"

SnakeMake From line 86 of rules/chromatin_annotated_etps.smk

shell:
  "wget -O {output.gtf}.gz {params.gtf_url} 2> {log} && gzip -d {output.gtf}.gz ; "
  "wget -O {output.fasta}.gz {params.fasta_url} 2>> {log} && gzip -d {output.fasta}.gz"

SnakeMake From line 22 of rules/create_alignment_refs.smk

shell:
  "Rscript scripts/processing/create_tapseq_annot.R -t {input} -b {params.BSgenome} "
  "-f {output.fasta} -o {output.gtf} 2> {log}"

SnakeMake From line 38 of rules/create_alignment_refs.smk

shell:
  "python scripts/processing/cropseq_vector_reference.py -i {input} -o {params.output_bn} "
  "--prefix {params.vector_prefix}"

SnakeMake From line 53 of rules/create_alignment_refs.smk

shell:
  "touch {output.fasta} {output.gtf}"

SnakeMake From line 62 of rules/create_alignment_refs.smk

shell:
  "cat {input.genome_gtf} > {output.cropseq_ref_gtf} && "
  "cat {input.vectors_gtf} >> {output.cropseq_ref_gtf} ; "
  "cat {input.genome_fasta} > {output.cropseq_ref_fasta} && "
  "cat {input.vectors_fasta} >> {output.cropseq_ref_fasta} ; "

SnakeMake From line 76 of rules/create_alignment_refs.smk

shell:
  "picard CreateSequenceDictionary "
  "REFERENCE={input} "
  "OUTPUT={output} "
  "2> {log}"

SnakeMake Picard From line 90 of rules/create_alignment_refs.smk

shell:
  "ConvertToRefFlat "
  "ANNOTATIONS_FILE={input.annot} "
  "SEQUENCE_DICTIONARY={input.seq_dict} "
  "OUTPUT={output} "
  "2> {log}"

SnakeMake From line 105 of rules/create_alignment_refs.smk

shell:
  "mkdir -p {output} && "  # create genomeDir directory
  "STAR --runThreadN {threads} "
  "--runMode genomeGenerate "
  "--genomeDir {output} "
  "--genomeFastaFiles {input.fasta} "
  "--sjdbGTFfile {input.gtf} "
  "--sjdbOverhang {params.sjdb_overhang} "
  "--outFileNamePrefix {params.outprefix} ; "
  "mv data/alignment_references/{wildcards.align_ref}/star.Log.out {log}"

SnakeMake STAR From line 124 of rules/create_alignment_refs.smk

shell:
  "cmd='fastq-dump --split-files --gzip --outdir {params.outdir} {wildcards.run}' ; "
  "echo $cmd > {log} ; $cmd >> {log}"

SnakeMake From line 31 of rules/download_fastq.smk

shell:
  "mv -v {params.outdir}/{params.access}_1.fastq.gz {output.fwd} > {log} & "
  "mv -v {params.outdir}/{params.access}_2.fastq.gz {output.rev} >> {log}"

SnakeMake From line 47 of rules/download_fastq.smk

shell:
  "bash scripts/download_fastq/merge_fastq.sh {params.outdir} {output.fwd} {output.rev} "
  "> {log}"

SnakeMake From line 65 of rules/download_fastq.smk

shell:
  "bash scripts/download_fastq/merge_fastq_i7.sh {params.outdir} {output.fwd} {output.rev} "
  "> {log}"

SnakeMake From line 84 of rules/download_fastq.smk

shell:
  "bash scripts/download_fastq/merge_fastq_i7.sh {params.outdir} {output.fwd} {output.rev} "
  "> {log} ; trimmomatic SE {output.rev} {params.outdir}/tmp.txt.gz CROP:58 2>> {log} ; "
  "mv -v {params.outdir}/tmp.txt.gz {output.rev} >> {log}"

SnakeMake Trimmomatic From line 103 of rules/download_fastq.smk

shell:
  "GatherMolecularBarcodeDistributionByGene "
  "INPUT={input} "
  "OUTPUT={output} "
  "NUM_CORE_BARCODES={params.ncells} "
  "EDIT_DISTANCE={params.edit_distance} "
  "READ_MQ={params.read_mq} "
  "MIN_BC_READ_THRESHOLD={params.min_umi_reads} "
  "RARE_UMI_FILTER_THRESHOLD={params.rare_umi_filter} "
  "2> {log}"

SnakeMake From line 42 of rules/extract_dge.smk

shell:
  "python scripts/processing/extract_dge.py -i {input.umi_obs} -o {output.dge} "
  "{params.whitelist_arg} --tpt_threshold {params.tpt_threshold} 2> {log}"

SnakeMake From line 69 of rules/extract_dge.smk

shell:
  "Rscript scripts/processing/perturbation_status.R --infile {input} --outfile {output} "
  "--vector_patter {params.vector_prefix} --min_txs {params.min_txs} --trim 2> {log}"

SnakeMake From line 88 of rules/extract_dge.smk

script:
  "../scripts/processing/dge_report.Rmd"

SnakeMake From line 106 of rules/extract_dge.smk

script:
  "../scripts/processing/dge_report_no_perts.Rmd"

SnakeMake From line 119 of rules/extract_dge.smk

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 19 of analyses/abc_analysis_screen.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# attach required packages
library(here)
library(pROC)
library(tidyverse)

R Markdown tidyverse here pROC From line 26 of analyses/abc_analysis_screen.Rmd

# load chromatin annotated ETPs generated by scripts/chromatin_annotated_etps/enhancer_screen_etps.R
pairs <- read.csv(here(snakemake@input), stringsAsFactors = FALSE)

R Markdown From line 36 of analyses/abc_analysis_screen.Rmd

# plot activity score vs interaction frequency
ggplot(pairs, aes(x = int_freq + 1, y = activity_score, color = prop_grna_hits)) +
  facet_wrap(~enh_chr, ncol = 2) +
  geom_point(data = filter(pairs, significant == 0), color = "gray69") +
  geom_point(data = filter(pairs, significant == 1)) +
  labs(x = "Hi-C interaction frequency", y = "Enhancer activity (DNAse + H3K27ac)",
       title = "Enhancer activity vs Hi-C contact frequency") +
  scale_color_gradient(low = "blue", high = "orangered", na.value = "gray69") +
  scale_x_log10() +
  scale_y_log10() +
  theme_bw()

R Markdown ggplot2 From line 47 of analyses/abc_analysis_screen.Rmd

# plot abc score vs distance to TSS
ggplot(pairs, aes(x = dist_to_tss, y = abc_score_norm, color = prop_grna_hits)) +
  facet_wrap(~enh_chr, ncol = 1) +
  geom_point(data = filter(pairs, significant == 0), color = "gray69", alpha = 0.25) +
  geom_point(data = filter(pairs, significant == 1)) +
  labs(x = "Distance to TSS", y = "ABC score", "ABC score vs distance to TSS") +
  scale_color_gradient(low = "blue", high = "orangered", na.value = "gray69") +
  scale_x_log10() +
  scale_y_log10() +
  theme_bw()

R Markdown ggplot2 From line 67 of analyses/abc_analysis_screen.Rmd

# plot abc score vs pvalue
ggplot(pairs, aes(x = abc_score_norm, y = -log10(pvalue), color = as.factor(significant))) +
  facet_wrap(~enh_chr, ncol = 2) +
  geom_point() +
  labs(x = "ABC score", color = "Significant") +
  scale_color_manual(values = c("1" = "steelblue", "0" = "gray69")) +
  theme_bw()

R Markdown ggplot2 From line 82 of analyses/abc_analysis_screen.Rmd

# function to calculate area under roc curve
calc_auroc <- function(category, prediction) {

  # order by prediction
  dat <- data.frame(category, prediction) %>%
    arrange(desc(prediction))

  # compute auc
  roc_obj <- roc(dat$category, dat$prediction)
  as.numeric(auc(roc_obj))

}

# only retain data on pairs that have at least one significant hit
sig_pairs <- pairs %>%
  group_by(sample, gene) %>%
  filter(sum(significant) > 0) %>%
  select(-c(2, 5:14, 23:29, 32))

# convert to long format and set pretty predictor names
sig_pairs_long <- sig_pairs %>%
  gather(key = "predictor", value = "predictor_value",
         -c(sample, perturbation, gene, significant)) %>%
  mutate(predictor = case_when(
    predictor == "int_freq" ~ "Hi-C int. freq.",
    predictor == "dist_to_tss" ~ "Distance to TSS",
    predictor == "activity_score" ~ "Activity score",
    predictor == "abc_score_norm" ~ "ABC score",
    TRUE ~ predictor))

# calculate auroc for all genes with significant enhancers
auroc <- sig_pairs_long %>%
  group_by(sample, predictor) %>%
  summarize(auc = calc_auroc(category = significant, prediction = predictor_value))

# plot auroc per gene and predictor
ggplot(auroc, aes(x = predictor, y = auc)) +
  facet_wrap(~sample, ncol = 2) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(y = "AUC", title = "AUROC") +
  coord_flip() +
  theme_bw() +
  theme(axis.title.x = element_blank())

R Markdown ggplot2 From line 99 of analyses/abc_analysis_screen.Rmd

# calculate mean and median auroc
auroc_stats <- auroc %>%
  group_by(predictor) %>%
  summarize(avg_auroc = mean(auc),
            med_auroc = median(auc))

# add predictor type for plot
auroc_stats <- auroc_stats %>%
  mutate(type = case_when(
    predictor %in% c("H3K27ac", "H3K27me3", "H3K4me1", "H3K4me3") ~ "Histone PTM",
    predictor %in% c("Hi-C int. freq.", "Distance to TSS") ~ "Proximity",
    predictor %in% c("Activity score", "ABC score") ~ "ABC model",
    predictor == "Dnase.seq" ~ "DNA accessibility",
    predictor == "POLR2A" ~ "Transcription"))

# plot average auroc per predictor
ggplot(auroc_stats, aes(x = fct_reorder(predictor, desc(avg_auroc)), y = avg_auroc, fill = type)) +
  geom_bar(stat = "identity") +
  labs(x = "Predictor", y = "Average AUROC", fill = "Type") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())

R Markdown ggplot2 From line 145 of analyses/abc_analysis_screen.Rmd

# only retain data on pairs that have at least one significant hit within 5 Mb of the TSS
sig_pairs_1mb <- pairs %>%
  group_by(sample, gene) %>%
  filter(dist_to_tss <= 1e6) %>%
  filter(sum(significant) > 0) %>%
  select(-c(2, 5:14, 23:29, 32))

# compute number of enhancers per gene
enh_per_gene <- summarize(sig_pairs_1mb, enhancers = n())

# plot number of enhancers per gene
ggplot(enh_per_gene, aes(x = gene, y = enhancers)) +
  facet_wrap(~sample, scales = "free_x") +
  geom_bar(stat = "identity") +
  labs(x = "Gene", y = "Enhancers", title = "Enhancers per gene within 1Mb") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# convert to long format and set pretty predictor names
sig_pairs_1mb_long <- sig_pairs_1mb %>%
  gather(key = "predictor", value = "predictor_value",
         -c(sample, perturbation, gene, significant)) %>%
  mutate(predictor = case_when(
    predictor == "int_freq" ~ "Hi-C int. freq.",
    predictor == "dist_to_tss" ~ "Distance to TSS",
    predictor == "activity_score" ~ "Activity score",
    predictor == "abc_score_norm" ~ "ABC score",
    TRUE ~ predictor))

# calculate auroc for all genes with significant enhancers
auroc_1mb <- sig_pairs_1mb_long %>%
  group_by(sample, predictor) %>%
  summarize(auc = calc_auroc(category = significant, prediction = predictor_value))

# plot auroc per gene and predictor
ggplot(auroc_1mb, aes(x = predictor, y = auc)) +
  facet_wrap(~sample, ncol = 2) +
  geom_bar(stat = "identity", fill = "steelblue") +
  labs(y = "AUC", title = "AUROC 1Mb") +
  coord_flip() +
  theme_bw() +
  theme(axis.title.x = element_blank())

R Markdown ggplot2 From line 173 of analyses/abc_analysis_screen.Rmd

# calculate mean and median auroc
auroc_stats_1mb <- auroc_1mb %>%
  group_by(predictor) %>%
  summarize(avg_auroc = mean(auc),
            med_auroc = median(auc))

# add predictor type for plot
auroc_stats_1mb <- auroc_stats_1mb %>%
  mutate(type = case_when(
    predictor %in% c("H3K27ac", "H3K27me3", "H3K4me1", "H3K4me3") ~ "Histone PTM",
    predictor %in% c("Hi-C int. freq.", "Distance to TSS") ~ "Proximity",
    predictor %in% c("Activity score", "ABC score") ~ "ABC model",
    predictor == "Dnase.seq" ~ "DNA accessibility",
    predictor == "POLR2A" ~ "Transcription"))

# plot average auroc per predictor
ggplot(auroc_stats_1mb, aes(x = fct_reorder(predictor, desc(avg_auroc)), y = avg_auroc,
                            fill = type)) +
  geom_bar(stat = "identity") +
  labs(x = "Predictor", y = "Average AUROC", fill = "Type") +
  scale_fill_brewer(palette = "Set1") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title.x = element_blank())

R Markdown ggplot2 From line 218 of analyses/abc_analysis_screen.Rmd

# function to calculate roc curve
calc_roc <- function(category, prediction) {

  # order by prediction
  dat <- data.frame(category, prediction) %>%
    arrange(desc(prediction))

  # compute roc
  roc_obj <- roc(dat$category, dat$prediction)

  # return tpr and fpr
  data.frame(TPR = rev(roc_obj$sensitivities), FPR = rev(1 - roc_obj$specificities))

}

# calculate roc for all genes with significant enhancers
roc_1mb <- sig_pairs_1mb_long %>%
  group_by(sample, predictor) %>%
  do(calc_roc(category = .$significant, prediction = .$predictor_value))

# order according to average auroc
order <- auroc_stats_1mb %>%
  arrange(desc(avg_auroc)) %>%
  pull(predictor)

roc_1mb <- roc_1mb %>%
  ungroup() %>%
  mutate(predictor = factor(predictor, levels = order))

# create labels to add auroc to plots
auroc_labels <- auroc_1mb %>%
  left_join(select(auroc_stats_1mb, predictor, avg_auroc), by = "predictor") %>%
  mutate(predictor = factor(predictor, levels = order)) %>%
  mutate(label = paste0("AUC: ", round(auc, digits = 2))) %>%
  mutate(x = 0.8, y = if_else(sample == "chr11", true = 0.25, false = 0.1))

# plot roc curves for every predictor
ggplot(roc_1mb, aes(x = FPR, y = TPR, color = sample)) +
  facet_wrap(~predictor, ncol = 2) +
  geom_line(size = 1) +
  geom_abline(lty = "dashed", color = "darkgray") +
  geom_text(data = auroc_labels, aes(x, y, label = label), show.legend = FALSE) +
  labs(color = "Region") +
  scale_color_manual(values = c("chr11" = "indianred3", "chr8" = "steelblue")) +
  theme_bw() +
  theme(legend.position = "top",
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 13))

R Markdown ggplot2 From line 250 of analyses/abc_analysis_screen.Rmd

library(tidyverse)
library(here)

# define functions ---------------------------------------------------------------------------------

# calculate transcripts-per-transcript for umi observations to filter for chimeric reads
calc_tpt <- function(umi_obs) {

  # calculate total number of reads for every cbc - umi combination
  total_reads <- umi_obs %>%
    group_by(Cell.Barcode, Molecular_Barcode) %>%
    summarize(total_reads = sum(Num_Obs))

  # add total reads to umi_obs and calculate TPT
  umi_obs %>%
    left_join(total_reads, by = c("Cell.Barcode", "Molecular_Barcode")) %>%
    mutate(tpt = Num_Obs / total_reads) %>%
    select(-total_reads)

}

# filter according to a specified tpt threshold
filter_tpt <- function(umi_obs_tpt, tpt_threshold) {
  umi_obs_tpt %>%
    filter(tpt >= tpt_threshold) %>%
    select(-tpt)
}

# downsample umi_obs for a specifc number of genic reads
downsample_reads <- function(umi_obs, reads, level = c("cell", "sample")) {

  level <- match.arg(level)

  # "uncount" umi_observations so that each read is a row by repeating each line number-of-reads
  # times (i.e. num_obs)
  reps <- rep(seq_len(nrow(umi_obs)), times = umi_obs$Num_Obs)
  umi_obs_reads <- umi_obs[reps, -4]

  # randomly sample reads per cell or the whole sample, depending on level parameter
  if (level == "cell") {
    sampled_reads <- umi_obs_reads %>%
      group_by(Cell.Barcode) %>%
      sample_n(size = reads, replace = FALSE)
  }else{
    sampled_reads <- sample_n(umi_obs_reads, size = reads, replace = FALSE)
  }

  # count observations of each sample-cell-umi combination
  sampled_umi_obs <- sampled_reads %>%
    group_by(Cell.Barcode, Gene, Molecular_Barcode) %>%
    summarize(Num_Obs = n()) %>%
    ungroup()

}

# extract dge data for each sample by calculating number of umis per gene and  cell
extract_dge <- function(umi_obs){
  umi_obs %>%
    count(Cell.Barcode, Gene) %>%
    rename(txs = n)
}

# convert dge from long format to wide matrix
create_dge_matrix <- function(dge_long) {
  dge_long %>%
    spread(key = Cell.Barcode, value = txs, fill = 0) %>%
    as.data.frame(stringsAsFactors == FALSE) %>%
    rename(GENE = Gene)
}


# load input data ----------------------------------------------------------------------------------

# get parameters
sample_reads <- as.numeric(snakemake@wildcards$reads)
sampling <- match.arg(snakemake@wildcards$sampling, choices = c("fixed", "avg"))
panel <- match.arg(snakemake@wildcards$panel, choices = c("onGenes", "onTarget"))

# set sed for reproducible sampling
set.seed(snakemake@params$seed)

# load input data
umi_obs <- read.table(here(snakemake@input$umi_obs), header = TRUE, stringsAsFactors = FALSE,
                      sep = "\t")
target_genes <- read.csv(here(snakemake@input$target_genes), stringsAsFactors = FALSE)
cbc_whitelist <- readLines(here(snakemake@input$whitelist))

# filter for cell barcodes on CBC whitelist
umi_obs_filt <- filter(umi_obs, Cell.Barcode %in% cbc_whitelist)

# remove any vector transcripts if vector_pattern is provided
if (!is.null(snakemake@params$vector_pattern)) {

  umi_obs_filt <- filter(umi_obs_filt, !grepl(Gene, pattern = snakemake@params$vector_pattern))

}

# only retain target genes if panel is set to onTarget
if (panel == "onTarget") {

  umi_obs_filt <- filter(umi_obs_filt, Gene %in% target_genes$gene)

}


# downsample genic reads ---------------------------------------------------------------------------

if (sampling == "fixed") {

  # calculate number of reads per cell
  reads_per_cell <- umi_obs_filt %>%
    group_by(Cell.Barcode) %>%
    summarize(reads = sum(Num_Obs))

  # filter for cells that have at least the desired sampling size of reads
  cells_filt <- reads_per_cell %>%
    filter(reads >= sample_reads) %>%
    pull(Cell.Barcode)

  # retain umi observations ony for these cells
  umi_obs_filt <- filter(umi_obs_filt, Cell.Barcode %in% cells_filt)

  # downsample umi observations
  ds_umi_obs <- downsample_reads(umi_obs_filt, reads = sample_reads, level = "cell")

}else{

  # calculate number of reads to sample from umi_observation
  ncells <- n_distinct(umi_obs_filt$Cell.Barcode)
  sample_reads <- ncells * sample_reads

  # abort if scaled sampling size is larger than the number of available reads
  if (sample_reads > sum(umi_obs_filt$Num_Obs)) {
    stop("Not enough input reads for desired sampling size per sample!")
  }

  # downsample umi observations
  ds_umi_obs <- downsample_reads(umi_obs_filt, reads = sample_reads, level = "sample")

}

# extract dge --------------------------------------------------------------------------------------

# calculate tpt and filter for chimeric reads using the specified tpt threshold
umi_obs_tpt <- calc_tpt(ds_umi_obs)
umi_obs_tpt <- filter_tpt(umi_obs_tpt, tpt_threshold = snakemake@params$tpt_threshold)

# extract dge and create dge matrix
dge <- extract_dge(umi_obs_tpt)
dge_mat <- create_dge_matrix(dge)

# output file
if (tools::file_ext(snakemake@output[[1]]) == "gz") {
  outfile <- gzfile(here(snakemake@output[[1]]))
}else{
  outfile <- here(snakemake@output[[1]])
}

# save dge matrix to output file
write.table(dge_mat, file = outfile, sep = "\t", row.names = FALSE, quote = FALSE)

R tidyverse here From line 5 of analyses/advanced_dge_downsampling.R

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 16 of analyses/chromatin_analyses_screen.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# disable all dplyr progress bars
options(dplyr.show_progress = FALSE)

# attach required packages
library(ggpubr)
library(here)
library(kableExtra)
library(rtracklayer)
library(tidyverse)

R Markdown tidyverse ggpubr here kableExtra From line 23 of analyses/chromatin_analyses_screen.Rmd

# load processed de output (includes confidence level and distance to TSS)
results <- here(snakemake@input$processed_results) %>%
  read.csv(stringsAsFactors = FALSE) %>%
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)),
         significant = if_else(pval_adj_allTests < 0.05, true = "sig", false = "non_sig"))

# encode chip-seq files
chip_files <- here(snakemake@input$encode_chip)
names(chip_files) <- sub("_.*", "", basename(chip_files))

R Markdown From line 38 of analyses/chromatin_analyses_screen.Rmd

# extract cis enhancer - gene pairs, remove significant pairs that increase gene expression
cis_enh_perts <- results %>%
  filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>%
  filter(!(significant == "sig" & manual_lfc > 0))

# only retain hits within the same target region (discard out of region control enhancers) and add
# "chr" to enhancer chromosome for use with ENCODE data
cis_enh_perts <- cis_enh_perts %>%
  filter(enh_chr == sub("Chr", "", sample)) %>%
  mutate(enh_chr = paste0("chr", enh_chr),
         gene_chr = paste0("chr", gene_chr))

# get enhancer coordinates and convert to GRanges object
enh_coords <- cis_enh_perts %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE) %>%
  `names<-`(.$perturbation)  # set names to perturbation id

# load encode chip-seq data
chip_data <- lapply(chip_files, FUN = import, which = range(enh_coords), format = "bigWig")

R Markdown bigWig From line 59 of analyses/chromatin_analyses_screen.Rmd

# function to calculate average assay signal per enhancer
avg_assay_signal <- function(assay, enh_coords) {
  overlaps <- findOverlaps(enh_coords, assay)
  overlaps <- split(overlaps, from(overlaps))
  names(overlaps) <- names(enh_coords[as.numeric(names(overlaps))])
  sapply(overlaps, function(x) {
    sum(mcols(assay[to(x)])$score * width(assay[to(x)])) / sum(width(assay[to(x)]))
  })
}

# calculate average chromatin signal at enhancers
enh_chrom_signal <- chip_data %>%
  sapply(FUN = avg_assay_signal, enh_coords = enh_coords) %>%
  as.data.frame() %>%
  rownames_to_column(var = "perturbation")

# transform to data.frame and merge with cis enhancer - gene pairs
cis_enh_perts <- cis_enh_perts %>%
  select(sample, perturbation, enh_chr, gene, pvalue, prop_grna_hits, dist_to_tss, significant) %>%
  left_join(enh_chrom_signal, by = "perturbation")

R Markdown From line 83 of analyses/chromatin_analyses_screen.Rmd

# function to randomly draw n non-significant pairs for a significant association. any enhancers
# from drawn non-significant pairs are excluded from subsequent draws
draw_control_pairs <- function(sig, non_sig, n = 3) {

  # get distance to tss bins for which a control pair needs to be drawn (repeat n times)
  bins <- select(sig, enh_chr, dTSS_bin)
  bins <- bins[rep(seq_len(nrow(bins)), each = n), ]

  # iteratively select a random negative control for each bin and remove any pairs involving that
  # enhancer from pool of non-significant pairs for any subsequenct draws
  random_ctrls <- data.frame()
  for (i in seq_len(nrow(bins))) {

    # get pairs on same chromosome and same distance to TSS bin
    nonSig_bin <- filter(non_sig, enh_chr == bins[i, "enh_chr"],
                         dTSS_bin == bins[i, "dTSS_bin"])

    # randomly draw a control pair (if possible)
    if (nrow(nonSig_bin)) {

      ctrl <- sample_n(nonSig_bin, size = 1)
      random_ctrls <- rbind(random_ctrls, ctrl)

      # remove any pairs involving that enhancer from non_sig
      non_sig <- filter(non_sig, perturbation != ctrl$perturbation)

    }else{
      warning("Not enough unique enhancers to draw control!\n", call. = FALSE)
    }
  }

  return(random_ctrls)

}

R Markdown From line 119 of analyses/chromatin_analyses_screen.Rmd

# create distance to TSS bins
dtss_range <- range(cis_enh_perts$dist_to_tss)
bins <- seq(from = dtss_range[1], to = dtss_range[2], by = 1e4)

# bin distance to tss
cis_enh_perts <- cis_enh_perts %>%
  mutate(dTSS_bin = cut(dist_to_tss, bins))

# extract significant enhancer - gene associations
sig_pairs <- filter(cis_enh_perts, significant == "sig")

# get other tested enhancer - gene pairs but whose enhancers appear only in non-sigificant tests
nonSig_pairs <- cis_enh_perts %>%
  filter(significant == "non_sig", !perturbation %in% sig_pairs$perturbation,
         perturbation != "GATA1")  # no GATA1, because it's the only on chrX

# randomly draw 3 control pairs for each significant enhancer - gene association
set.seed(20190617)
nonSig_ctrls <- draw_control_pairs(sig_pairs, non_sig = nonSig_pairs, n = 3)

# combine significant and control pairs
all_pairs <- bind_rows(sig_pairs, nonSig_ctrls)

R Markdown From line 156 of analyses/chromatin_analyses_screen.Rmd

# plot distance to TSS for significant pairs and sampled non-significant control pairs
ggplot(all_pairs, aes(x = dist_to_tss, fill = significant)) +
  facet_wrap(~sample) +
  geom_histogram(bins = 30) +
  labs(x = "Distance to TSS", y = "pairs",
       title = "Distance to TSS of significant vs non-significant control pairs") +
  theme_bw()

# figure caption
cap <- "Distance to TSS distribution of significant enhancers and sampled non-significant controls"

R Markdown ggplot2 From line 181 of analyses/chromatin_analyses_screen.Rmd

# extract unique enhancers from gene-enhancer pairs and convert to long format
enh_plot <- all_pairs %>%
  select(-c(sample, gene, pvalue, prop_grna_hits, dist_to_tss, dTSS_bin)) %>%
  distinct() %>%
  gather(key = "assay", value = "fc_over_ctrl", -c(perturbation, enh_chr, significant))

# pairwise comparisons using wilcoxon test and format adjusted p-value for plotting
pw_tests <- compare_means(fc_over_ctrl ~ significant, group.by = "assay", data = enh_plot,
                          method = "wilcox.test", p.adjust.method = "holm") %>%
  mutate(p.adj.plot = if_else(p.adj < 0.001, true = "< 0.001",
                              false = paste("=", round(p.adj, digits = 2))))

# plot difference in enhancer relevant chromatin marks between significant and non-significant hits
ggplot(enh_plot, aes(x = significant, y = fc_over_ctrl + 1, fill = significant)) +
  facet_wrap(~assay, ncol = 2) +
  geom_boxplot(notch = TRUE) +
  stat_pvalue_manual(pw_tests, label = "P adj. {p.adj.plot}", y.position = log10(145),
                     inherit.aes = FALSE) +
  labs(y = "Chromatin FC over control") +
  scale_x_discrete(labels = c("Non significant", "Significant")) +
  scale_y_log10(limits = c(NA, 180)) +
  scale_fill_manual(values = c(non_sig = "darkgray", sig = "steelblue")) +
  theme_bw() + 
  theme(axis.title.x = element_blank(), legend.position = "none", text = element_text(size = 15),
        panel.grid = element_blank())

# figure caption
cap <- "Chromatin marks of significant enhancers vs non-significant control enhancers"

R Markdown ggplot2 From line 197 of analyses/chromatin_analyses_screen.Rmd

# add label for confidence level based on prop gRNA hits
all_pairs_conf <- all_pairs %>%
  filter(!is.na(prop_grna_hits)) %>%
  mutate(conf = if_else(significant == "sig" & prop_grna_hits >= 0.5,
                        true = "High confidence", false = "Non significant"),
         conf = if_else(significant == "sig" & prop_grna_hits < 0.5,
                        true = "Low confidence", false = conf))

# convert to long format for plot
all_pairs_conf <- all_pairs_conf %>%
  gather(key = "assay", value = "fc_over_ctrl", -c(1:8, dTSS_bin, conf))

# perform kruskal-wallis test (non-parametric one-way anova) per assay and reformat adjusted p-value
kw_tests <- compare_means(formula = fc_over_ctrl ~ conf, method = "kruskal.test",
                          data = all_pairs_conf, group.by = "assay") %>%
  mutate(p.adj.plot = if_else(p.adj < 0.001, true = "< 0.001",
                              false = paste("=", round(p.adj, digits = 2)))) %>%
  mutate(group1 = "High confidence", group2 = "Low confidence")  # add fake groups for text in plots

# plot chromatin signal for high-, low-confidence and non significant enhancer - gene pairs
all_pairs_conf %>%
  filter(assay != "H3K27me3") %>%
  ggplot(., aes(x = conf, y = fc_over_ctrl + 1, fill = conf)) +
  facet_wrap(~assay, ncol = 4) +
  geom_boxplot(notch = TRUE) +
  stat_pvalue_manual(data = filter(kw_tests, assay != "H3K27me3"), y.position = 130, size = 5.5,
                     label = "P adj. {p.adj.plot}", remove.bracket = TRUE, inherit.aes = FALSE) +
  labs(y = "FC over control") +
  scale_x_discrete(labels = c("High conf.", "Low conf.", "Non sig.")) +
  scale_fill_manual(values = c("High confidence" = "goldenrod2",
                               "Low confidence" = "darkslategray3", "Non significant" = "gray")) +
  scale_y_log10(limits = c(NA, 180)) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", text = element_text(size = 20),
        panel.grid = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

# figure caption
cap <- "Chromatin mark signal versus enhancer confidence level"

# save plot for manuscript
ggsave(here("results/plots", "chromatin_sig_vs_conf.pdf"), device = "pdf", width = 9,
       height = 3.5)

R Markdown ggplot2 From line 235 of analyses/chromatin_analyses_screen.Rmd

# plot H3K27me3 vs confidence levels
all_pairs_conf %>%
  filter(assay == "H3K27me3") %>%
  ggplot(., aes(x = conf, y = fc_over_ctrl + 1, fill = conf)) +
  facet_wrap(~assay) +
  geom_boxplot(notch = TRUE) +
  stat_pvalue_manual(data = filter(kw_tests, assay == "H3K27me3"), y.position = 8.7, size = 5.5,
                     label = "P adj. {p.adj.plot}", remove.bracket = TRUE, inherit.aes = FALSE) +
  labs(y = "FC over control") +
  scale_x_discrete(labels = c("High conf.", "Low conf.", "Non sig.")) +
  scale_fill_manual(values = c("High confidence" = "goldenrod2",
                               "Low confidence" = "darkslategray3", "Non significant" = "gray")) +
  scale_y_log10(limits = c(NA, 10)) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none", text = element_text(size = 20),
        panel.grid = element_blank(), axis.text.x = element_text(angle = 45, hjust = 1))

# figure caption
cap <- "H3k27me3 signal versus enhancer confidence level"

R Markdown ggplot2 From line 280 of analyses/chromatin_analyses_screen.Rmd

# perform individual pairwise tests
pw_tests <- compare_means(fc_over_ctrl ~ conf, group.by = "assay", data = all_pairs_conf,
                          method = "wilcox.test", p.adjust.method = "holm")

# number of etps per group
etps_per_group <- count(all_pairs_conf, assay, conf)

# add number of etps per group and reformat for printing
pw_tests <- pw_tests %>%
  left_join(etps_per_group, by = c("assay" = "assay", "group1" = "conf")) %>% 
  left_join(etps_per_group, by = c("assay" = "assay", "group2" = "conf")) %>% 
  select(assay, group1, group2, n.group1 = n.x, n.group2 = n.y, p.format, p.signif, p.adj) %>%
  rename(p.value = p.format)

# create pretty html table
pw_tests %>%
  kable() %>%
  kable_styling(bootstrap_options = c("striped", "condensed"), full_width = FALSE,
                position = "left")

R Markdown From line 307 of analyses/chromatin_analyses_screen.Rmd

# save pairwise tests as .csv file
write.csv(pw_tests, file = here("results/chromatin_pairwise_tests.csv"), row.names = FALSE)

R Markdown From line 329 of analyses/chromatin_analyses_screen.Rmd

from __future__ import print_function
import sys, argparse
import pandas as pd

# define functions ---------------------------------------------------------------------------------

# helper function to print to stderr
def eprint(*args, **kwargs):
  print(*args, file = sys.stderr, **kwargs)

# collapse perturbations by target
def collapse_perts(perts, targets):

  # add targets to perturbation status and remove VECTOR from data.frame
  perts = pd.merge(targets, perts, how = 'right', on = 'VECTOR').drop(columns = 'VECTOR')

  # count number of perturbations per target
  perts = perts.groupby('TARGET').sum()

  return perts

# execute if run as main program -------------------------------------------------------------------

if __name__ == '__main__':

  # parse command line arguments
  parser = argparse.ArgumentParser(description = 'Collapse perturbations by gRNA targets.')

  parser.add_argument('-i', '--inputfile', help = 'File containing perturbation status data to \
    merge. Typically output of perturbation_status.R', required = True)
  parser.add_argument('-o', '--outputfile', help = 'Output file for collapsed perturbation data',
    required = True)
  parser.add_argument('-t', '--targets', help = 'Two column .csv file listing every gRNA vector \
    (column named VECTOR) and their targets (column named TARGET).', required = True)
  parser.add_argument('-c', '--count', help = 'Should perturbations be counted rather than \
    returned as binary matrix (default)', action='store_true')

  args = parser.parse_args()

  # load perturbation and vector target files
  eprint('Loading input files...')
  perts = pd.read_csv(args.inputfile, sep = '\t')
  targets = pd.read_csv(args.targets)

  # collapse pertubations by vector targets
  eprint('Collapsing perturbations...')
  perts = collapse_perts(perts, targets)

  # convert to binary matrix unless specified otherwise
  if not args.count:
    eprint('Converting collapsed perturbations to binary...')
    perts[perts > 0] = 1

  eprint('Writing collapsed perturbations to file...')
  perts.to_csv(args.outputfile, sep = '\t', index = True)

  eprint('Done!')

Python Pandas From line 6 of analyses/collapse_perturbations.py

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 17 of analyses/compare_covariates.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# attach required packages
library(corrplot)
library(eulerr)
library(factoextra)
library(gridExtra)
library(here)
library(rtracklayer)
library(tidyverse)

R Markdown tidyverse gridExtra here factoextra corrplot eulerr From line 24 of analyses/compare_covariates.Rmd

# dge files
dge_files <- here(snakemake@input$dge)
names(dge_files) <- basename(dirname(dge_files))

# load dge data
dge <- lapply(dge_files, FUN = read.table, header = TRUE, row.names = "GENE")

# differential expression results
de_files <- here(snakemake@input$results)

# extract relevant information from filenames
de_files_info <- de_files %>%
  sub(".*data/", "", .) %>%
  sub("/diff_expr/output_MAST_", "_", .) %>%
  sub("Covar.csv", "", .) %>%
  sub("_no", "_none", .)

# set names and load all de output files
names(de_files) <- de_files_info
de_results <- lapply(de_files, FUN = read.csv, stringsAsFactor = FALSE)

# merge into one data.frame and reformat id variable
de_results <- de_results %>%
  bind_rows(.id = "id") %>%
  separate(id, into = c("sample", "strategy", "covars"), sep = "_")

# load target gene annotations
annot <- lapply(here(snakemake@input$annot), FUN = import, format = "gtf")

# merge into one GRanges object
annot <- annot %>%
  do.call("c", .) %>%
  unique()

# filter for exons of protein-coding genes (no processed transcripts etc)
annot <- annot[annot$type == "exon" &
                 annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding"
               ]

# split by gene name into a GRangesList
genes <- split(annot, f = annot$gene_name)

R Markdown From line 38 of analyses/compare_covariates.Rmd

# function to extract gene expression data from total dge
extract_genex <- function(x, vector_pattern) {
  gene_rows <- grep(rownames(x), pattern = vector_pattern, invert = TRUE)
  x[gene_rows, ]
}

# function to remove dge data of cells from certain 10x lanes
filter_lanes <- function(x, lanes) {
  cells <- colnames(x)
  cell_lanes <- substr(cells, start = 1, stop = 8)
  cells_filt <- cells[!cell_lanes %in% lanes]
  x[, cells_filt]
}

# 10x lanes excluded from DE tests
rm_lanes <- unlist(snakemake@config$map_enhancers$remove_lanes)
rm_lanes[rm_lanes == "none"] <- NA

# extract gene expression data and filter for 10x lanes
genex <- dge %>%
  lapply(FUN = extract_genex, vector_pattern = snakemake@params$vector_pattern) %>%
  mapply(FUN = filter_lanes, x = ., lanes = rm_lanes, SIMPLIFY = FALSE)

# perform pca on gene expression data
pca <- genex %>%
  lapply(FUN = t) %>%
  lapply(FUN = prcomp, scale. = TRUE)

R Markdown From line 89 of analyses/compare_covariates.Rmd

# quickly plot cell scores for first 2 principal components
p1 <- fviz_pca_ind(pca$`11iScreen1`, geom = "point", title = "Chromosome 11", col.ind = "steelblue")
p2 <- fviz_pca_ind(pca$`8iScreen1`, geom = "point", title = "Chromosome 8", col.ind = "steelblue")

# print plots
grid.arrange(p1, p2, ncol = 2)

R Markdown From line 119 of analyses/compare_covariates.Rmd

# extract eigenvalues
eig_val <- lapply(pca, FUN = get_eigenvalue)

# process eigenvalues (top 20)
eig_val_top20 <- eig_val %>%
  lapply(FUN = rownames_to_column, var = "Dim") %>%
  lapply(FUN = head, 20) %>%
  bind_rows(.id = "sample") %>%
  mutate(Dim = as.integer(sub("Dim\\.", "", Dim))) %>%
  mutate(gt1 = if_else(eigenvalue > 1, true = ">1", false = "<=1"))

# plot eigenvalues per PC
ggplot(eig_val_top20, aes(x = factor(Dim), y = eigenvalue, fill = gt1)) +
  facet_wrap(~sample, ncol = 1) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = 1, lty = "dashed", col = "firebrick2") +
  labs(x = "PC", title = "Eigenvalues of top 30 PCs", fill = "eigenvalue") +
  scale_fill_manual(values = c(">1" = "steelblue", "<=1" = "darkgray")) +
  theme_bw()

R Markdown ggplot2 From line 129 of analyses/compare_covariates.Rmd

# scree plot showing percentage of variance explained per PC
p1 <- fviz_eig(pca$`11iScreen1`, ncp = 11, addlabels = TRUE, ylim = c(0, 40), main = "Chr11")
p2 <- fviz_eig(pca$`8iScreen1`, ncp = 10, addlabels = TRUE, ylim = c(0, 40), main = "Chr8")

# get cummulative variance explained of top 11, resp 10 PCs
cum_prop_chr11 <- filter(eig_val_top20, sample == "11iScreen1")[11, 5]
cum_prop_chr8 <- filter(eig_val_top20, sample == "8iScreen1")[10, 5]

# print plots
grid.arrange(p1, p2, top = "Scree plots")

R Markdown From line 154 of analyses/compare_covariates.Rmd

# plot correlation circle
p1 <- fviz_pca_var(pca$`11iScreen1`, col.var = "contrib", alpha.var = 0.5, title = "Genes chr11",
                   gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))
p2 <- fviz_pca_var(pca$`8iScreen1`, col.var = "contrib", alpha.var = 0.5, title = "Genes chr8",
                   gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"))

grid.arrange(p1, p2, ncol = 2)

R Markdown From line 172 of analyses/compare_covariates.Rmd

# plot contribution of all genes to 
p1 <- fviz_contrib(pca$`11iScreen1`, choice = "var", axes = 1,
                   title = "Contribution of genes to PC1")
p2 <- fviz_contrib(pca$`11iScreen1`, choice = "var", axes = 2,
                   title = "Contribution of genes to PC2")

grid.arrange(p1, p2, top = "Chromosome 11")

R Markdown From line 185 of analyses/compare_covariates.Rmd

# plot contribution of all genes to 
p1 <- fviz_contrib(pca$`8iScreen1`, choice = "var", axes = 1,
                   title = "Contribution of genes to PC1")
p2 <- fviz_contrib(pca$`8iScreen1`, choice = "var", axes = 2,
                   title = "Contribution of genes to PC2")

grid.arrange(p1, p2, top = "Chromosome 8")

R Markdown From line 195 of analyses/compare_covariates.Rmd

# function to compute average expression, standard deviation and coefficient of variation
gene_expr_stats <- function(gene_expr) {
  avg_expr <- rowMeans(gene_expr)
  sd_expr <- apply(gene_expr, MARGIN = 1, FUN = sd)
  coeff_var <- sd_expr / avg_expr

  data.frame(avg_expr, sd_expr, coeff_var, check.rows = TRUE) %>%
    rownames_to_column(var = "gene")
}

# compute gene expression stats for every gene in both samples and reformat
gene_stats <- genex %>%
  lapply(gene_expr_stats) %>%
  bind_rows(.id = "sample") %>%
  gather(key = "stat", value = "value", -sample, -gene)

# extract contributions of genes to PCs 1 & 2 and reformat
contrib <- pca %>%
  lapply(FUN = function(x) get_pca_var(x)$contrib[,1:2]) %>%
  lapply(FUN = as.data.frame) %>%
  lapply(rownames_to_column, var = "gene") %>%
  bind_rows(.id = "sample") %>%
  rename(PC1 = Dim.1, PC2 = Dim.2) %>%
  gather(key = "PC", value = "contribution", -sample, -gene)

# add gene expression statistics to contributions
contrib <- left_join(contrib, gene_stats, by = c("sample", "gene"))

# plot correlation of contribution with gene stats for chromosome 11
p1 <- contrib %>%
  filter(sample == "11iScreen1", stat != "sd_expr") %>%
  ggplot(., aes(x = value, y = contribution / 100)) +
  facet_grid(PC~stat, scales = "free")+
  geom_point(color = "steelblue") +
  labs(x = "Value", y = "Contribution (%)", title = "Contribution vs. gene stats (chr11)") +
  scale_y_log10(labels = scales::percent) +
  scale_x_log10() +
  theme_bw()

# plot correlation of contribution with gene stats for chromosome 8
p2 <- contrib %>%
  filter(sample == "8iScreen1", stat != "sd_expr") %>%
  ggplot(., aes(x = value, y = contribution / 100)) +
  facet_grid(PC~stat, scales = "free")+
  geom_point(color = "steelblue") +
  labs(x = "Value", y = "Contribution (%)", title = "Contribution vs. gene stats (chr8)") +
  scale_y_log10(labels = scales::percent) +
  scale_x_log10() +
  theme_bw()

# print plots
grid.arrange(p1, p2, ncol = 1)

R Markdown ggplot2 From line 209 of analyses/compare_covariates.Rmd

# function to compute interesting gene expression properties per cell
cell_dge_stats <- function(dge, vector_pattern) {

  # separate dge into gene and vector expression
  vctr_rows <- grep(rownames(dge), pattern = vector_pattern)
  gene_expr <- dge[-vctr_rows, ]
  vctr_expr <- dge[vctr_rows, ]

  # calculate stats
  ngenes <- colSums(gene_expr > 0)
  total_txs <- colSums(gene_expr)

  # calculate vector stats
  total_vctr <- colSums(vctr_expr)

  data.frame(ngenes, total_txs, total_vctr, check.rows = TRUE) %>%
    rownames_to_column(var = "cell") 
}

# calculate dge statistics per experiment
cell_stats <- dge %>%
  lapply(FUN = cell_dge_stats, vector_pattern = snakemake@params$vector_pattern) %>%
  bind_rows(.id = "sample")

R Markdown From line 268 of analyses/compare_covariates.Rmd

# function to correlate with cell level stats
correlate_pcs <- function(x, method = "spearman") {

  # remove sample variable and make cell rownames
  x <- select(x, -sample) %>%
    `rownames<-`(NULL) %>%  # remove existing row names
    column_to_rownames(var = "cell")

  # separate x into PCs and meta variables
  pc_cols <- grep(colnames(x), pattern = "^PC")
  pc_rot <- x[, pc_cols]
  meta_vars <- x[, -pc_cols]

  # correlate pcas with meta vars
  cor(pc_rot, meta_vars, method = method)

}

# extract scores for first 5 PCs for both experiments
pc_scores <- pca %>%
  lapply(FUN = function(pca) as.data.frame(pca$x[, 1:5])) %>%
  lapply(FUN = rownames_to_column, var = "cell") %>%
  bind_rows(.id = "sample")

# add cell level stats to pc_scores
pc_scores_stats <- pc_scores %>%
  left_join(cell_stats, by = c("sample", "cell"))

# correlate PCs with cell-level stats
corr_mats <- pc_scores_stats %>%
  split(., f = .$sample) %>%
  map(., .f = correlate_pcs)

# create correlation plots
layout(matrix(1:2, ncol = 2))
corrplot(corr_mats$`11iScreen1`, title = "11iScreen1", mar=c(0,0,2,0), cl.align.text = "l",
         cl.ratio = 0.5)
corrplot(corr_mats$`8iScreen1`, title = "8iScreen1", mar=c(0,0,2,0), cl.align.text = "l",
         cl.ratio = 0.5)

R Markdown From line 294 of analyses/compare_covariates.Rmd

# plot first and second PC colored according to number of expressed genes
p1 <- ggplot(pc_scores_stats, aes(x = PC1, y = PC2, color = ngenes)) +
  facet_wrap(~sample) +
  geom_point() +
  labs(title = "Number of detected genes", color = "Genes") +
  coord_fixed() +
  theme_bw()

# plot first and second PC colored according to number of gene transcripts
p2 <- ggplot(pc_scores_stats, aes(x = PC1, y = PC2, color = total_txs)) +
  facet_wrap(~sample) +
  geom_point() +
  labs(title = "Gene transcripts", color = "Transcripts") +
  coord_fixed() +
  theme_bw()

# plot first and second PC colored according to number of vector transcripts
p3 <- ggplot(pc_scores_stats, aes(x = PC1, y = PC2, color = log10(total_vctr))) +
  facet_wrap(~sample) +
  geom_point() +
  labs(title = "Vector transcripts", color = "log10(Transcripts)") +
  coord_fixed() +
  theme_bw()

# print plots
p1
p2
p3

R Markdown ggplot2 From line 336 of analyses/compare_covariates.Rmd

# apply multiple testing correction per pc covar and strategy
de_results <- de_results %>%
  group_by(strategy, covars) %>%
  rename(pval_adj_sample = pval_adj_allTests) %>%
  mutate(pval_adj_allTests = p.adjust(pvalue, method = "fdr"))

# extract perEnh hits
enh_perts <- de_results %>%
  filter(strategy == "perEnh") %>%
  mutate(significant = if_else(pval_adj_allTests < 0.05, true = "sig", false = "not_sig"))

# function to collapse gRNA ids per targeted enhancer
collapse_grnas <- function(grnas) {
  grnas <- sub("_.+", "", grnas)
  sub("\\.[A|B|C|D]", "", grnas)
}

# count number of gRNA hits per enhancer-gene pair
gRNA_hits <- de_results %>%
  filter(strategy == "perGRNA") %>%
  mutate(perturbation = collapse_grnas(perturbation)) %>%
  group_by(sample, covars, perturbation, gene) %>%
  summarize(grna_hits = sum(pval_adj_allTests < 0.05),
            prop_grna_hits = mean(pval_adj_allTests < 0.05))

# add single gRNA hits to enhancer hits
enh_perts <- left_join(enh_perts, gRNA_hits, by = c("sample", "covars", "perturbation", "gene"))

R Markdown From line 377 of analyses/compare_covariates.Rmd

# get all significant hits and reformat to wide format, so that each row is one enhancer - target
# gene link and "covariate columns" specify whether that associatiation was discovered or not
pairs <- enh_perts %>%
  ungroup() %>%
  filter(significant == "sig") %>%
  select(sample, covars, perturbation, gene) %>%
  mutate(sig = 1) %>%
  spread(key = covars, value = sig, fill = 0) %>%
  mutate(total = rowSums(.[, -c(1:3)]))

# compute euler diagrams
euler_chr11 <- pairs %>%
  filter(sample == "11iScreen1") %>%
  select(-sample, -perturbation, -gene, -total) %>%
  euler()

euler_chr8 <- pairs %>%
  filter(sample == "8iScreen1") %>%
  select(-sample, -perturbation, -gene, -total) %>%
  euler()

# plot diagrams
p1 <- plot(euler_chr11, quantities = TRUE, labels = list(font = 4), main = "Chromosome 11")
p2 <- plot(euler_chr8, quantities = TRUE, labels = list(font = 4), main = "Chromosome 8")

# print plots
grid.arrange(p1, p2, ncol = 2)

R Markdown From line 409 of analyses/compare_covariates.Rmd

# calculate number of hits per confidence level
hits_conf_levels <- enh_perts %>%
  group_by(sample, covars, grna_hits) %>%
  summarize(hits = sum(pval_adj_allTests < 0.05))

# plot number of hits per confidence level
ggplot(hits_conf_levels, aes(x = as.factor(grna_hits), y = hits, fill = as.factor(covars))) +
  facet_wrap(~sample) +
  geom_bar(stat = "identity", position = "dodge") + 
  labs(x = "Individual gRNA hits", y = "Significant hits", title = "Confidence level",
       fill = "PC covariates") +
  theme_bw()

R Markdown ggplot2 From line 439 of analyses/compare_covariates.Rmd

# get hits that are found only in one unique test
unique_pairs <- pairs %>%
  right_join(enh_perts, by = c("sample", "perturbation", "gene")) %>%
  filter(total == 1)

# calculate number of hits per confidence level
unique_hits <- unique_pairs %>%
  group_by(sample, covars, grna_hits) %>%
  summarize(hits = sum(pval_adj_allTests < 0.05))

# plot confidence levels for these hits
ggplot(unique_hits, aes(x = factor(grna_hits), y = hits, fill = covars)) +
  facet_wrap(~sample) +
  geom_bar(stat = "identity", position = "dodge") + 
  labs(x = "Individual gRNA hits", y = "Significant hits",
       title = "Confidence of hits unique to one model", fill = "PC covariates") +
  theme_bw()

R Markdown ggplot2 From line 454 of analyses/compare_covariates.Rmd

# download chain file for hg19 to hg38 liftover
chain_url <- "http://hgdownload.soe.ucsc.edu/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz"
chain_file <- tempfile("hg19ToHg38", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg19_to_hg38_chain <- import.chain(tools::file_path_sans_ext(chain_file))

R Markdown From line 478 of analyses/compare_covariates.Rmd

# extract data on discovery perturbations
disc_perts <- enh_perts %>%
  filter(grepl(perturbation, pattern = "^chr.+$")) %>%
  as.data.frame()

# extract enhancer coordinates from perturbation id and create GRanges object
enh_coords_hg19 <- disc_perts %>%
  select(perturbation) %>%
  distinct() %>%
  separate(perturbation, into = c("chr", "start", "end"), remove = FALSE) %>%
  column_to_rownames(var = "perturbation") %>%
  makeGRangesFromDataFrame()

# liftover from hg19 to hg38
enh_coords_hg38 <- enh_coords_hg19 %>%
  liftOver(chain = hg19_to_hg38_chain) %>%
  unlist()

# compute center of every enhancer
enh_centers <- data.frame(perturbation = names(enh_coords_hg38),
                          enh_chr = sub("chr", "", as.character(seqnames(enh_coords_hg38))),
                          enh_center = round((start(enh_coords_hg38) + end(enh_coords_hg38)) / 2),
                          stringsAsFactors = FALSE)

# add enhancer centers to de results
disc_perts <- disc_perts %>%
  left_join(enh_centers, by = "perturbation")

R Markdown ucsc-liftover From line 489 of analyses/compare_covariates.Rmd

# get chromosome and strand for each gene
gene_chr <- unlist(unique(seqnames(genes)))
gene_strand <- unlist(unique(strand(genes)))

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    min(start(x))
  }else if (all(strand(x) == "-")) {
    max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
}

# get TSS for each gene
gene_tss <- sapply(genes, FUN = get_tss)

# create data.frame with strand and tss coordinates for every gene
tss_coords <- data.frame(gene_chr, gene_strand, gene_tss, check.rows = TRUE) %>%
  rownames_to_column(var = "gene") %>%
  mutate_if(is.factor, as.character)

# add to discovery perturbation results
disc_perts <- left_join(disc_perts, tss_coords, by = "gene")

# calculate distance to tss for every enhancer - gene pair
disc_perts <- disc_perts %>%
  mutate(dist_to_tss = if_else(gene_strand == "+",
                               true  = enh_center - gene_tss,
                               false = gene_tss - enh_center))

# only retain enhancer - gene pairs on the same chromosomes
disc_perts_cis <- filter(disc_perts, enh_chr == gene_chr)

R Markdown From line 519 of analyses/compare_covariates.Rmd

# calculate -log10 p-value
disc_perts_cis <- mutate(disc_perts_cis, neg_log10_pvalue = -log10(pvalue))

# plot -log10 p-value as a function of distance to TSS
ggplot(disc_perts_cis,
             aes(x = dist_to_tss / 1e6, y = neg_log10_pvalue, color = prop_grna_hits)) +
  facet_grid(covars~sample) +
  geom_point(data = filter(disc_perts_cis, significant == "not_sig")) +
  geom_point(data = filter(disc_perts_cis, significant == "sig")) +
  labs(x = "Distance to TSS (Mb)", y = expression(-log[10] ~ p - value),
       title = "-Log10 p-value vs distance to TSS") +
  scale_color_gradient(low = "blue", high = "orangered") +
  theme_bw()

R Markdown ggplot2 From line 556 of analyses/compare_covariates.Rmd

# only select significant enhancer - gene pairs
enh_gene_pairs <- filter(disc_perts_cis, significant == "sig")

# plot the distance to TSS vs number of gRNA hits
ggplot(enh_gene_pairs, aes(x = as.factor(grna_hits), y = abs(dist_to_tss) / 1000, color = sample)) +
  facet_wrap(~covars, ncol = 2) +
  geom_point(position = position_jitterdodge()) +
  geom_boxplot(outlier.shape = NA, fill = NA) +
  labs(x = "Confidence level (gRNA hits)", y = "Distance to TSS (kb)",
       title = "Distance to TSS vs. confidence level") +
  scale_y_log10() +
  theme_bw()

R Markdown ggplot2 From line 572 of analyses/compare_covariates.Rmd

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# required functions
source(file.path(snakemake@scriptdir, "SingleCellExperiment_tapseq.R"))
source(file.path(snakemake@scriptdir, "differential_expression_fun.R"))

# register parallel backend if specified (if more than 1 thread provided)
parallel <- FALSE
if (snakemake@threads > 1) {
  library(BiocParallel)
  register(MulticoreParam(workers = snakemake@threads))
  parallel <- TRUE
}

# set seed for reproducible results
if (snakemake@params$seed > 0) set.seed(snakemake@params$seed)

# prepare input data ===============================================================================

# load DGE data
dge <- read.table(snakemake@input$dge, header = TRUE, row.names = "GENE")

# load perturbation status data
perturb_status <- read.table(snakemake@input$perturb_status, header = TRUE, row.names = 1)

# remove negative controls, as these are not supposed to be tested
neg_ctrls <- grep(rownames(perturb_status), pattern = "^non-targeting.*$", perl = TRUE)
perturb_status <- perturb_status[-neg_ctrls, ]

# count cells per perturbation and save to file for later analyses
cells_per_pert <- rowSums(perturb_status > 0)
cells_per_pert <- data.frame(perturbation = rownames(perturb_status), cells = cells_per_pert)
write.csv(cells_per_pert, file = snakemake@output$ncells, row.names = FALSE)

# create tap-seq SingleCellExperiment object
sce <- create_tapseq_sce(dge = dge, perturb_status = perturb_status,
                         vector_pattern = snakemake@params$vector_pattern)

# remove cells from specified 10x lanes
if (!is.null(snakemake@params$exclude_lanes)) {

  # get cells that do not belong to filtered out lanes
  cells <- colnames(sce)
  cell_lanes <- substr(cells, start = 1, stop = 8)
  cells_filt <- cells[!cell_lanes %in% snakemake@params$exclude_lanes]

  # only retain data on filtered cells
  sce <- sce[, cells_filt]

}

# add covariates ===================================================================================

# covariates specified via output file name
covars <- snakemake@wildcards$covars

# compute covariates and add as colData to the SingleCellExperiment object
if (covars == "nGenesCovar") {

  # number of expressed genes per cell
  nGenes <- colSums(assay(sce) > 0)
  colData(sce) <- DataFrame(nGenes)

}else if (grepl(covars, pattern = "^\\d+pcCovar$", perl = TRUE)) {

  # perform PCA on gene expression data and extract PC scores of each cell
  pca_genex <- prcomp(t(assay(sce)), scale. = TRUE)
  pcs <- seq_len(as.integer(sub("pcCovar", "", covars)))
  pc_loadings <- DataFrame(pca_genex$x[, pcs])
  colData(sce) <- pc_loadings

}else if (covars != "noCovar") {
  stop("Incorrect covariate wildcard!", call. = FALSE)
}

# perform DE tests =================================================================================

# perform differential gene expression analysis
output <- test_differential_expression(sce, min_cells = snakemake@params$min_cells,
                                       method = snakemake@wildcards$method, parallel = parallel)

# save output
write.csv(output, file = snakemake@output$results, row.names = FALSE)

R From line 5 of analyses/differential_expression.R

library(optparse)

# create arguments list
option_list = list(
  make_option(c("-b", "--bam_file"), type = "character", default = NULL,
              help = "Path to bam file containing aligned reads",
              metavar = "character"),
  make_option(c("-d", "--dge_summary_file"), type = "character", default = NULL,
              help = "Path to txt file containing dge summary", metavar = "character"),
  make_option(c("-p", "--vector_pattern"), type = "character", default = NULL,
              help = "Pattern for reads mapping to CROP-seq vectors", metavar = "character"),
  make_option(c("-o", "--output_file"), type = "character", default = NULL,
              help = "Output file", metavar = "character")

)

# parse arguments
opt_parser = OptionParser(option_list = option_list)
opt = parse_args(opt_parser)

# function to check for required arguments
check_required_args <- function(arg, opt, opt_parser) {
  if (is.null(opt[[arg]])) {
    print_help(opt_parser)
    stop(arg, " argument is required!", call. = FALSE)
  }
}

# check that all required parameters are provided
required_args <- c("bam_file", "dge_summary_file", "vector_pattern", "output_file")
for (i in required_args) {
  check_required_args(i, opt = opt, opt_parser = opt_parser)
}

# define functions =================================================================================

# function to extract reads mapping to CROP-seq vectors and extract number of mismatches etc.
extract_vectors <- function(bam, cell_barcodes, output_file, vector_pattern) {

  # create cell barcode patterns and write to temporary file
  cbc_patterns <- paste0("XC:Z:", cell_barcodes)
  cbc_file <- tempfile(pattern = "cbc_patterns_", fileext = ".txt")
  write(cbc_patterns, file = cbc_file)

  # write heder to file
  header = c("cell_barcode", "umi_barcode", "aligned_vector_id", "mapping_quality", "mismatches",
             "mismatch_positions", "sequence")
  gz_outfile <- gzfile(output_file, open = "w")
  write(paste(header, collapse = "\t"), file = gz_outfile)
  close(gz_outfile)

  # bash command to filter bam file for CROP-seq vector reads from specified cells and to extract
  # mismatches, barcodes, and read sequence
  command <- paste("samtools view", bam, "| fgrep -f", cbc_file,
    "| fgrep", paste0("GE:Z:", vector_pattern),
    "| awk 'BEGIN { FS = \" \" }; {print $12\"\\t\"$20\"\\t\"$14\"\\t\"$5\"\\t\"$19\"\\t\"$13\"\\t\"$10}'",
    "| gzip >> ", output_file)

  # execute command
  system(command)

}

# extract CROP-seq vectors from provided bam file ==================================================

# read dge summary file and extract cell barcodes
dge_summary <- read.table(opt$dge_summary_file, header = TRUE, stringsAsFactors = FALSE)
cells <- dge_summary$cell_barcode

# extract CROP-seq vectors
extract_vectors(opt$bam_file, cell_barcodes = cells, output_file = opt$output_file,
                vector_pattern = opt$vector_pattern)

R SAMtools optparse From line 9 of analyses/extract_cropseq_vectors.R

umi_obs <- read.table(snakemake@input$umi_obs, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
target_genes <- read.csv(snakemake@input$target_genes, stringsAsFactors = FALSE)

# get genes that are expressed in all cell types
genes_all_cells <- target_genes[target_genes$marker == "all", "gene"]

# get umi observations for these genes
umi_obs_out <- umi_obs[umi_obs$Gene %in% genes_all_cells, ]

# save to output file
colnames(umi_obs_out)[1] <- "Cell Barcode"
write.table(umi_obs_out, file = snakemake@output[[1]], sep = "\t", row.names = FALSE, quote = FALSE)

R From line 4 of analyses/filter_mmix_umi_obs.R

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 13 of analyses/hic_analysis.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# disable dplyr progress
options(dplyr.show_progress = FALSE)

# attach required packages
library(ggpubr)
library(here)
library(HiTC)
library(Matrix)
library(tidyverse)
library(rtracklayer)
library(tools)

R Markdown tidyverse ggpubr here Matrix HiTC From line 20 of analyses/hic_analysis.Rmd

# load processed de output (includes confidence level and distance to TSS)
results <- here(snakemake@input$processed_results) %>%
  read.csv(stringsAsFactors = FALSE) %>%
  mutate(sample = paste0("chr", sub("iScreen1", "", sample)))

# extract cis-interactions on chromosome 8 and 11 and add column identifying significant hits
cis_enh_perts <- results %>%
  filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>%
  mutate(enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr),
         significant = if_else(pval_adj_allTests < 0.05, true = "sig", false = "non_sig"))

# remove validation controls on other chromosomes than the samples' target region and significant
# hits that increase gene expression
cis_enh_perts <- cis_enh_perts %>%
  filter(sample == enh_chr) %>%
  filter(!(significant == "sig" & manual_lfc > 0))

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(file_path_sans_ext(chain_file))

R Markdown From line 37 of analyses/hic_analysis.Rmd

# extract hg38 enhancer coordinates
enh_coords_hg38 <- cis_enh_perts %>%
  select(perturbation, enh_chr, enh_start, enh_end) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "enh_chr",
                           start.field = "enh_start", end.field = "enh_end")

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end)

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

R Markdown ucsc-liftover From line 65 of analyses/hic_analysis.Rmd

# extract gene TSS coordinates
gene_tss_hg38 <- cis_enh_perts %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_hg19 <- gene_tss_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand)

# replace gene tss hg38 coordinates with hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                                false = gene_tss - enh_center)) %>%
  select(-enh_center)

R Markdown ucsc-liftover From line 87 of analyses/hic_analysis.Rmd

# add label for confidence level based on prop gRNA hits
cis_enh_perts <- cis_enh_perts %>%
  filter(!is.na(prop_grna_hits)) %>%
  mutate(conf = case_when(
    significant == "non_sig" ~ "Non significant",
    significant == "sig" & prop_grna_hits >= 0.5 ~ "High confidence",
    significant == "sig" & prop_grna_hits <  0.5 ~ "Low confidence")
  )

R Markdown From line 117 of analyses/hic_analysis.Rmd

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
                        )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

R Markdown From line 132 of analyses/hic_analysis.Rmd

# k562 intrachromosomal sparse matrix and krnorm vector files for chromosomes 8 and 11
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))

krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))

# import normalization vectors
chr8_KRnorm  <- as.numeric(readLines(krnorm_files["chr8"]))
chr11_KRnorm <- as.numeric(readLines(krnorm_files["chr11"]))

# infer number of bins per chromosome based on the normalization vectors
chr8_bins  <- length(chr8_KRnorm)
chr11_bins <- length(chr11_KRnorm)

# import hi-c data for these chromosomes
chr8_hic  <- import_hic(scm_files["chr8"],  chromosome = "chr8", resolution = 5000,
                        bins = chr8_bins)
chr11_hic <- import_hic(scm_files["chr11"], chromosome = "chr11", resolution = 5000,
                        bins = chr11_bins)

R Markdown From line 165 of analyses/hic_analysis.Rmd

# function to normalize Hi-C data based on provided normalization vectors
normalize_hic <- function(htc_obj, norm_vector) {

  # extract raw observed interaction matrix
  raw_obs <- intdata(htc_obj)

  # create normalization matrix by computing the outer product of the normalization vector
  norm_mat <- outer(norm_vector, norm_vector)

  # multiply observed interactions by normalization matrix and add back to HTC object
  intdata(htc_obj) <- raw_obs / norm_mat
  return(htc_obj)

}

# normalize HiC data
chr8_hic_norm  <- normalize_hic(chr8_hic,  norm_vector = chr8_KRnorm)
chr11_hic_norm <- normalize_hic(chr11_hic, norm_vector = chr11_KRnorm)

R Markdown From line 188 of analyses/hic_analysis.Rmd

# infer chromosomal region range
region_coords <- cis_enh_perts %>%
  select(sample, enh_start, enh_end, gene_tss) %>%
  gather(key = "key", value = "coord", -sample) %>%
  group_by(sample) %>%
  summarize(start = min(coord), end = max(coord))

# calculate bin coordinates that contain the entire regions
region_bins <- region_coords %>%
  mutate(start = floor(start / 5000) * 5000,
         end = ceiling(end / 5000) * 5000)

# extract data for assessed regions
chr8_region_hic <- extractRegion(chr8_hic_norm, MARGIN = c(1, 2), chr = "chr8",
                                 from = pull(filter(region_bins, sample == "chr8"), start),
                                 to = pull(filter(region_bins, sample == "chr8"), end))

chr11_region_hic <- extractRegion(chr11_hic_norm, MARGIN = c(1, 2), chr = "chr11",
                                  from = pull(filter(region_bins, sample == "chr11"), start),
                                  to = pull(filter(region_bins, sample == "chr11"), end))

R Markdown From line 209 of analyses/hic_analysis.Rmd

# function to extract interaction frequency versus distance between genomic bins
intFreq_over_distance <- function(htc_obj, type = "background") {

  # extract interaction matrix and compute distances between bins (in bp)
  ints <- intdata(htc_obj)
  dists <- intervalsDist(htc_obj)

  # get unique interactions (upper triangle of matrix, including diagonal), because matrix is
  # symmetric
  ints_unique <- suppressMessages(ints[upper.tri(ints, diag = TRUE)])
  dists_unique <- suppressMessages(dists[upper.tri(dists, diag = TRUE)])

  # create data.frame containing interaction frequency across all in distances
  int_over_dist <- data.frame(dist = dists_unique, int_freq = ints_unique, type = type,
                              stringsAsFactors = FALSE)

  # sort according to distance
  int_over_dist[order(int_over_dist$dist), ]

}

# compute expected background interaction frequency by calculating the distance between all observed
# genomic bins
chr8_int_over_dist  <- intFreq_over_distance(chr8_region_hic)
chr11_int_over_dist <- intFreq_over_distance(chr11_region_hic)

# combine into one data.frame and remove any NaN values, which were introduced because the
# normalization failed for certain bins
expect_int <- bind_rows(chr8 = chr8_int_over_dist, chr11 = chr11_int_over_dist, .id = "sample") %>%
  filter(!is.nan(int_freq))

R Markdown From line 239 of analyses/hic_analysis.Rmd

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies for all bins and add pair id
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-pair)

}

R Markdown From line 276 of analyses/hic_analysis.Rmd

# compute interaction frequencies for all tested enhancer - gene pairs
chr8_pairs <- cis_enh_perts %>%
  filter(enh_chr == "chr8", enh_chr == "chr8") %>%
  compute_int_freq(., htc_object = chr8_region_hic)

chr11_pairs <- cis_enh_perts %>%
  filter(enh_chr == "chr11", enh_chr == "chr11") %>%
  compute_int_freq(., htc_object = chr11_region_hic)

# combine into one data.frame and extract relevant columns
int_freq_pairs <- bind_rows(chr8_pairs, chr11_pairs) %>%
  select(sample, dist_bins, int_freq, significant, conf) %>%
  rename(dist = dist_bins, type = significant)

# add to expected background frequencies
int_freqs <- expect_int %>%
  mutate(conf = as.character(NA)) %>%
  bind_rows(int_freq_pairs)

R Markdown From line 331 of analyses/hic_analysis.Rmd

# add an amount equal to bin size to distance for plotting on log10 scale and reformat labels
int_freqs_plot <- int_freqs %>%
  mutate(dist = dist + 5000) %>%
  mutate(sample = sub("c", "C", sample)) %>%
  mutate(conf = sub(" confidence", "", conf),
         conf = sub("Non significant", "NS", conf))

# plot interaction frequencies between
p1 <- ggplot(drop_na(int_freqs_plot), aes(x = conf, y = int_freq + 1, color = conf)) +
  facet_wrap(~sample, ncol = 1) +
  geom_jitter(width = 0.2, size = 2.5) +
  geom_boxplot(color = "black", outlier.shape = NA, fill = NA) +
  labs(y = "Hi-C Interaction frequency", color = "Confidence:") +
  scale_color_manual(values = c("High" = "goldenrod2",
                                "Low" = "darkslategray3", "NS" = "gray")) +
  scale_y_log10(limits = c(NA, 1200), breaks = c(1, 10, 100, 1000)) +
  theme_bw() +
  theme(text = element_text(size = 22), panel.grid = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

# plot interaction frequency as a function of distance
x_breaks <- c(0, 10^(4:10))  # breaks to draw on x axis (without added bin distance!)
p2 <- ggplot(int_freqs_plot, aes(x = dist, y = int_freq + 1)) +
  facet_wrap(~sample, ncol = 1) + 
  geom_point(data = filter(int_freqs_plot, type == "background"), pch = 4, size = 2,
             color = "black", alpha = 1) +
  geom_point(data = filter(int_freqs_plot, type == "non_sig"), aes(color = conf), alpha = 1,
             size = 2.5) +
  geom_point(data = filter(int_freqs_plot, type == "sig"), aes(color = conf), size = 2.5) +
  geom_smooth(data = filter(int_freqs_plot, type != "sig"), aes(linetype = type),
              color = "indianred3", se = FALSE, lwd = 2) +
  labs(x = "Distance (kb)", y = "Hi-C Interaction frequency", color = "Confidence",
       linetype = "Type") +
  scale_color_manual(values = c("High" = "goldenrod2", "Low" = "darkslategray3", "NS" = "gray")) +
  scale_x_log10(breaks = x_breaks + 5000, labels = scales::comma(x_breaks / 1000)) +
  scale_y_log10(limits = c(NA, 1200), breaks = c(1, 10, 100, 1000)) +
  theme_bw() +
  theme(text = element_text(size = 22), panel.grid = element_blank())

# arrange plots with common legend (props ggpubr!)
ggarrange(p1, p2 + theme(axis.title.y = element_blank()), ncol = 2, nrow = 1, common.legend = TRUE,
          legend = "top", widths = c(1, 2))

R Markdown ggplot2 From line 354 of analyses/hic_analysis.Rmd

# plot interaction frequencies across confidence levels and significant vs non-significant
p1 <- ggplot(drop_na(int_freqs_plot), aes(x = conf, y = int_freq + 1, color = conf)) +
  geom_jitter(width = 0.2, size = 2.5) +
  geom_boxplot(color = "black", outlier.shape = NA, fill = NA) +
  labs(y = "Hi-C Interaction frequency", color = "Confidence:") +
  scale_color_manual(values = c("High" = "goldenrod2",
                                "Low" = "darkslategray3", "NS" = "gray")) +
  scale_y_log10(limits = c(NA, 1200), breaks = c(1, 10, 100, 1000)) +
  theme_bw() +
  theme(text = element_text(size = 23.5), panel.grid = element_blank(),
        axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1))

# plot interaction frequency as a function of distance
x_breaks <- c(0, 10^(4:10))  # breaks to draw on x axis (without added bin distance!)
p2 <- ggplot(int_freqs_plot, aes(x = dist, y = int_freq + 1)) +
  geom_point(data = filter(int_freqs_plot, type == "background"), pch = 4, size = 2,
             color = "black", alpha = 1) +
  geom_point(data = filter(int_freqs_plot, type == "non_sig"), aes(color = conf), alpha = 1,
             size = 2.5) +
  geom_point(data = filter(int_freqs_plot, type == "sig"), aes(color = conf), size = 2.5) +
  geom_smooth(data = filter(int_freqs_plot, type != "sig"), aes(linetype = type),
              color = "indianred3", se = FALSE, lwd = 2) +
  labs(x = "Distance (kb)", y = "Hi-C Interaction frequency", color = "Confidence",
       linetype = "Type") +
  scale_color_manual(values = c("High" = "goldenrod2", "Low" = "darkslategray3", "NS" = "gray")) +
  scale_x_log10(breaks = x_breaks + 5000, labels = scales::comma(x_breaks / 1000)) +
  scale_y_log10(limits = c(NA, 1200), breaks = c(1, 10, 100, 1000)) +
  theme_bw() +
  theme(text = element_text(size = 22), panel.grid = element_blank())

# arrange plots with common legend (props ggpubr!)
ggarrange(p1, p2 + theme(axis.title.y = element_blank()), ncol = 2, nrow = 1, common.legend = TRUE,
          legend = "top", widths = c(1, 2))

# save plot for manuscript
ggsave(here("results/plots", "hic_interaction_frequencies_combined.png"), device = "png", width = 8,
       height = 4)
ggsave(here("results/plots", "hic_interaction_frequencies_combined.pdf"), device = "pdf", width = 8,
       height = 4)

R Markdown ggplot2 From line 402 of analyses/hic_analysis.Rmd

# separate pair interaction frequencies into significant and non-significant interactions
int_freq_sig <- filter(int_freqs, type == "sig")
int_freq_nonSig <- filter(int_freqs, type == "non_sig")
int_freq_background <- filter(int_freqs, type == "background")

# function to randomly select non-significant controls with same distance
sample_ctrls <- function(pair, nonSig_pairs, n = 2) {

  # get non-significant controls from the same sample and with the same bin distance
  ctrl_pairs <- filter(nonSig_pairs, sample == pair$sample, dist == pair$dist)

  # randomly select n controls
  ctrls <- sample_n(ctrl_pairs, size = n, replace = FALSE)

  # add confidence level of pair for which the control was drawn
  ctrls$conf <- pair$conf

  return(ctrls)

}

# randomly sample non-significant controls
set.seed(20190626)
sampled_ctrls <- int_freq_sig %>%
  rowwise() %>%
  do(sample_ctrls(pair = ., nonSig_pairs = int_freq_nonSig, n = 2))

# randomly sample background controls
sampled_bckgrnd <- int_freq_sig %>%
  rowwise() %>%
  do(sample_ctrls(pair = ., nonSig_pairs = int_freq_background, n = 2))

# combine significant pairs and sampled controls
all_int_pairs <- rbind(int_freq_sig, sampled_ctrls) %>%
  mutate(conf_plot = case_when(
    type == "sig" ~ conf,
    type == "non_sig" ~ "non_sig",
    type == "background" ~ "background"
  ))

# perform pairwise tests
pw_tests <- compare_means(int_freq ~ type, group.by = "conf", data = all_int_pairs,
                          method = "wilcox.test", p.adjust.method = "holm")

# plot interaction frequencies
ggplot(all_int_pairs, aes(x = fct_relevel(type, "sig", "non_sig"), y = int_freq + 1, fill = conf_plot)) +
  facet_wrap(~conf, ncol = 1) +
  geom_boxplot() +
  suppressWarnings(stat_pvalue_manual(data = pw_tests, y.position = log10(450), inherit.aes = FALSE,
                                      label = "p = {p.format}", size = 7, lwd = 0.75)) +
  labs(y = "Hi-C interaction frequency") +
  scale_fill_manual(values = c("High confidence" = "goldenrod2", "non_sig" = "gray",
                               "Low confidence" = "darkslategray3", "background" = "black")) +
  scale_y_log10(limits = c(NA, 1000)) +
  scale_x_discrete(labels = c("Sig.", "Non sig.")) +
  theme_bw() +
  theme(text = element_text(size = 25), legend.position = "none", axis.title.x = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "hic_sig_vs_nonsig.pdf"), device = "pdf", width = 4, height = 7)

R Markdown ggplot2 From line 450 of analyses/hic_analysis.Rmd

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 16 of analyses/map_enhancers.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# attach required packages
library(ggrepel)
library(gridExtra)
library(here)
library(readxl)
library(rtracklayer)
library(tidyverse)
library(tools)

R Markdown tidyverse ggrepel gridExtra here readxl From line 23 of analyses/map_enhancers.Rmd

# files containing number of cell pert perturbation
ncell_files <- here(snakemake@input$ncells)

# set name for each file
samples <- basename(dirname(dirname(ncell_files)))
names(ncell_files) <- basename(ncell_files) %>%
  sub("ncells_MAST_", "", .) %>%
  sub("_.*", "", .) %>%
  paste(samples, ., sep = "_")

# files containing differential expression results
de_output_files <- here(snakemake@input$de_output)

# set name for each file
samples <- basename(dirname(dirname(de_output_files)))
names(de_output_files) <- basename(de_output_files) %>%
  sub("output_MAST_", "", .) %>%
  sub("_.*", "", .) %>%
  paste(samples, ., sep = "_")

# read files
ncells <- lapply(ncell_files, FUN = read.csv, stringsAsFactors = FALSE)
de_output <- lapply(de_output_files, FUN = read.csv, stringsAsFactors = FALSE)

# convert to one data.frame
ncells <- ncells %>%
  bind_rows(.id = "id") %>%
  separate(id, into = c("sample", "strategy"), sep = "_") %>%
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)))

de_output <- de_output %>%
  bind_rows(.id = "id") %>%
  separate(id, into = c("sample", "strategy"), sep = "_") %>%
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)))

# load processed de output (includes confidence level and distance to TSS)
results <- here(snakemake@input$processed_results) %>%
  read.csv(stringsAsFactors = FALSE) %>% 
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)))

# load perturbation status
perturb_status_files <- here(snakemake@input$perturb_status)
names(perturb_status_files) <- basename(dirname(perturb_status_files))
perturb_status <- lapply(perturb_status_files, FUN = read.table, header = TRUE,
                         row.names = "TARGET", stringsAsFactors = FALSE)

# load vector targets
vector_target_files <- here(snakemake@input$vector_targets)
names(vector_target_files) <- sub(".+_(chr\\d+)_.+", "\\1", basename(vector_target_files))
vector_targets <- vector_target_files %>%
  lapply(FUN = read.csv, stringsAsFactors = FALSE) %>%
  bind_rows(.id = "sample")

R Markdown From line 37 of analyses/map_enhancers.Rmd

# minimum number of cells per perturbation used for DE tests
min_cells <- snakemake@config$map_enhancers$min_cells %>%
  unlist() %>%
  data.frame(strategy = names(.), min_cells = .)

# calculate percentage of perturbation with at least min_cells cells
cells_perc <- ncells %>%
  group_by(sample, strategy) %>%
  summarize(cells = sum(cells >= min_cells[strategy, "min_cells"]),
            cells_perc = cells / n() * 100)

# plot number of cells pert perturbation
ggplot(ncells, aes(cells)) +
  facet_grid(strategy~sample, scales = "free") +
  geom_histogram(bins = 45) +
  geom_vline(data = min_cells, aes(xintercept = min_cells), color = "firebrick") +
  labs(title = "Cells per perturbation") +
  theme_bw()

R Markdown ggplot2 From line 98 of analyses/map_enhancers.Rmd

# correct for multiple testing across samples using FDR
de_output <- de_output %>%
  group_by(strategy) %>%
  rename(pval_adj_sample = pval_adj_allTests) %>%
  mutate(pval_adj_allTests = p.adjust(pvalue, method = "fdr"))

# plot p-value distribution
ggplot(de_output, aes(pvalue)) +
  facet_grid(strategy~sample, scales = "free") +
  geom_histogram(bins = 100) +
  labs(title = "P-value distribution") +
  theme_bw()

R Markdown ggplot2 From line 120 of analyses/map_enhancers.Rmd

# count the number and proportion of hits across different FDR values
hits <- de_output %>%
  group_by(sample, strategy) %>%
  summarize("0.05" = sum(pval_adj_allTests < 0.05),
            "0.1" = sum(pval_adj_allTests < 0.1),
            "0.15" = sum(pval_adj_allTests < 0.15),
            "0.2" = sum(pval_adj_allTests < 0.2),
            tests = n()) %>%
  gather(key = "fdr", value = "hits", -sample, -strategy, -tests) %>%
  mutate(fdr = as.numeric(fdr)) %>%
  mutate(prop_hits = hits / tests)

# plot number of hits as function of FDR
p1 <- ggplot(hits, aes(x = fdr, y = hits, color = sample, lty = strategy)) +
  geom_point() +
  geom_line() +
  labs(x = "FDR cutoff", y = "Significant tests", title = "Number of significant tests") +
  scale_color_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  theme_bw()

# plot proportion of hits as function of FDR
p2 <- ggplot(hits, aes(x = fdr, y = prop_hits, color = sample, lty = strategy)) +
  geom_point() +
  geom_line() +
  labs(x = "FDR cutoff", y = "Significant tests", title = "Proportion of significant tests") +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  theme_bw()

# print plots
grid.arrange(p1, p2, ncol = 1)

R Markdown ggplot2 From line 138 of analyses/map_enhancers.Rmd

# add variable for significant associations
results <- results %>%
  mutate(significant = if_else(pval_adj_allTests < 0.05, true = "sig", false = "non_sig"))

# calculate number of hits per confidence level
hits_conf_levels <- results %>%
  group_by(sample, grna_hits) %>%
  summarize(hits = sum(significant == "sig"))

# plot number of hits per confidence level
ggplot(hits_conf_levels, aes(x = as.factor(grna_hits), y = hits, fill = sample)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "gRNA hits (pvalue adj. < 0.05)", y = "Hits", title = "Hits per confidence level",
       fill = "Sample") +
  scale_fill_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  theme_bw()

R Markdown ggplot2 From line 177 of analyses/map_enhancers.Rmd

# known enhancer perturbations
known_enh <- c("GATA1", "HS2", "MYC", "ZFPM2")

# get all transfected enhancer perturbations
enh_pert_targets <- vector_targets %>%
  select(sample, TARGET) %>%
  distinct() %>%
  filter(grepl(TARGET, pattern = "chr\\d+:.*") | TARGET %in% known_enh) %>%
  mutate(sample = paste0(sub("chr", "", sample), "iScreen1"))

# only retain perturbation status on enhancer perturbations
enh_perts <- perturb_status %>%
  lapply(FUN = function(x) {
    enh_rows <- rownames(x) %in% enh_pert_targets$TARGET
    x[enh_rows, ]
  })

R Markdown From line 205 of analyses/map_enhancers.Rmd

# count number of cells per perturbation
cells_per_pert <- enh_perts %>%
  lapply(FUN = rowSums) %>%
  lapply(FUN = function(x) data.frame(pert = names(x), cells = x, stringsAsFactors = FALSE) ) %>%
  bind_rows(.id = "sample")

# add transfected, but not detected perturbations (in case there are any...)
cells_per_pert <- cells_per_pert %>%
  full_join(enh_pert_targets, by = c("sample" = "sample", "pert" = "TARGET")) %>%
  mutate(cells = replace_na(cells, replace = 0)) %>%
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)))

# plot number of cell per perturbation
ggplot(cells_per_pert, aes(x = sample, y = cells, fill = sample, color = sample)) +
  geom_violin() +
  labs(y = "Cells") +
  scale_fill_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_color_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_y_continuous(breaks = seq(0, 600, 100)) +
  theme_bw() +
  theme(text = element_text(size = 28), legend.position = "none", axis.title.x = element_blank(),
        panel.grid = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "cells_per_pert.pdf"), device = "pdf", width = 5, height = 5)

R Markdown ggplot2 From line 226 of analyses/map_enhancers.Rmd

# count number of perturbations per cell
perts_per_cell <- perturb_status %>%
  lapply(FUN = colSums) %>%
  lapply(FUN = function(x) data.frame(cell = names(x), perts = x, stringsAsFactors = FALSE) ) %>%
  bind_rows(.id = "sample") %>%
  mutate(sample = paste0("Chr", sub("iScreen1", "", sample)))

# compute histogram
perts_per_cell_hist <- perts_per_cell %>%
  group_by(sample, perts) %>%
  summarize(cells = n()) %>%
  mutate(perts = factor(perts, levels = seq(0, max(.$perts))))

# plot perturbations per cell histogram
ggplot(perts_per_cell_hist, aes(x = perts, y = cells / 1000, fill = sample)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Perturbations per cell", y = "Cells (x 1000)",
       fill = "Region:") +
  scale_fill_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_x_discrete(drop = FALSE) +
  theme_bw() +
  theme(text = element_text(size = 22), legend.text = element_text(size = 25),
        legend.position = "top", panel.grid = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "perts_per_cell.pdf"), device = "pdf", width = 7,  height = 5)

R Markdown ggplot2 From line 256 of analyses/map_enhancers.Rmd

# extract cis enhancer - gene pairs and remove significant pairs that increase gene expression
cis_enh_perts <- results %>%
  filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>%
  filter(!(significant == "sig" & manual_lfc > 0))

# only retain hits within the same target region (discard out of region control enhancers)
cis_enh_perts <- cis_enh_perts %>%
  filter(enh_chr == sub("Chr", "", sample))

# count the number of enhancers for each gene
enh_per_gene <- cis_enh_perts %>%
  group_by(sample, gene) %>%
  summarize(enh = sum(significant == "sig"))

# compute proportion of genes with at least one discovered enhancer
prop_with_enh <- enh_per_gene %>%
  summarize(enh = sum(enh > 0),
            genes = n(),
            prop_enh = enh / genes)

# print table
knitr::kable(prop_with_enh)

# compute number of genes per enhancer count
enh_counts <- enh_per_gene %>%
  filter(enh > 0) %>%
  group_by(sample, enh) %>%
  summarize(genes = n()) %>%
  spread(key = "sample", value = "genes", fill = 0) %>%
  gather(key = "sample", value = "genes", -enh)

# plot number of genes per enhancer count
ggplot(enh_counts, aes(x = factor(enh, levels = seq_len(max(enh))), y = genes, fill = sample)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(x = "Enhancers", y = "Genes",
       fill = "Region:") +
  scale_fill_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_x_discrete(drop = FALSE) +
  theme_bw() +
  theme(text = element_text(size = 25), panel.grid = element_blank(),
        legend.position = c(0.8, 0.85))

# save plot for manuscript
ggsave(here("results/plots", "enhancers_per_gene.pdf"), device = "pdf", width = 5, height = 5)

R Markdown ggplot2 From line 291 of analyses/map_enhancers.Rmd

# calculate -log10 p-value
cis_enh_perts <- mutate(cis_enh_perts, neg_log10_pvalue = -log10(pvalue))

# plot -log10 p-value as a function of distance to TSS
ggplot(cis_enh_perts, aes(x = dist_to_tss / 1e6, y = neg_log10_pvalue, color = significant)) +
  facet_wrap(~sample, ncol = 1, scales = "free") +
  geom_point(data = filter(cis_enh_perts, significant == "non_sig")) +
  geom_point(data = filter(cis_enh_perts, significant == "sig")) +
  labs(x = "Distance to TSS (Mb)", y = expression(-log[10] ~ p - value)) +
  scale_color_manual(values = c(non_sig = "darkgray", sig = "firebrick3")) +
  theme_bw()

R Markdown ggplot2 From line 342 of analyses/map_enhancers.Rmd

# set grna hits to NA for non-significant hits
cis_enh_perts_plot <- cis_enh_perts %>%
  mutate(grna_hits = if_else(significant != "sig", true = as.integer(NA), false = grna_hits),
         prop_grna_hits = if_else(significant != "sig", true = as.numeric(NA),
                                  false = prop_grna_hits))

# same plot, but color according to confidence level
ggplot(cis_enh_perts_plot,
             aes(x = dist_to_tss / 1e6, y = neg_log10_pvalue, color = prop_grna_hits)) +
  facet_wrap(~sample, ncol = 1, scales = "free") +
  geom_point(data = filter(cis_enh_perts_plot, significant == "non_sig")) +
  geom_point(data = filter(cis_enh_perts_plot, significant == "sig")) +
  labs(x = "Distance to TSS (Mb)", y = expression(-log[10] ~ p - value)) +
  scale_color_gradient(low = "blue", high = "orangered", na.value = "gray69") +
  theme_bw()

R Markdown ggplot2 From line 356 of analyses/map_enhancers.Rmd

# get hit closest to TSS
closest_hit <- cis_enh_perts_plot %>%
  filter(significant == "sig") %>%
  slice(which.min(abs(dist_to_tss)))

# get hits further away than the closest hit for plotting
cis_enh_perts_plot <- cis_enh_perts_plot %>%
  filter(abs(dist_to_tss) >= abs(closest_hit$dist_to_tss))

# plot decay of association strength over distance within 1Mb of TSS
p1 <- ggplot(cis_enh_perts_plot, aes(x = abs(dist_to_tss) / 1e6, y = neg_log10_pvalue,
                                     color = prop_grna_hits)) +
    labs(x = "Distance to TSS (Mb)", y = expression(-log[10] ~ p - value),
         color = "Confidence") +
    geom_point(data = filter(cis_enh_perts_plot, significant == "non_sig"), size = 3.5) +
    geom_point(data = filter(cis_enh_perts_plot, significant == "sig"), size = 3.5) +
    scale_color_gradient(low = "blue", high = "orangered", na.value = "gray69") +
    theme_bw() +
    theme(text = element_text(size = 25), panel.grid = element_blank())

# same plot, but with logarithmic distance
p2 <- ggplot(cis_enh_perts_plot, aes(x = abs(dist_to_tss) / 1e6, y = neg_log10_pvalue,
                                     color = prop_grna_hits)) +
    geom_smooth(data = filter(cis_enh_perts_plot, significant == "sig"),
                method = "loess", se = FALSE, color = "black", lwd = 2) +
    labs(x = "Distance to TSS (Mb)", y = expression(-log[10] ~ p - value)) +
    geom_point(data = filter(cis_enh_perts_plot, significant == "non_sig"), size = 2.5) +
    geom_point(data = filter(cis_enh_perts_plot, significant == "sig"), size = 2.5) +
    scale_x_log10(breaks = c(0.01, 0.1, 1, 10), labels = c("0.01", "0.1", "1", "10")) +
    scale_color_gradient(low = "blue", high = "orangered", na.value = "gray69") +
    theme_bw() +
    theme(text = element_text(size = 20), axis.title = element_blank(), legend.position = "none",
          panel.grid = element_blank(), plot.background = element_rect(fill = NA))

# add logarithmic plot as inset of linear plot
p1 + 
  annotation_custom(
    grob = ggplotGrob(p2),
    xmin = 18,
    xmax = 56,
    ymin = 40,
    ymax = 122
  )

# save plot for manuscript
ggsave(here("results/plots", "enh_pval_vs_dist.pdf"), device = "pdf", width = 8, height = 5)
ggsave(here("results/plots", "enh_pval_vs_dist.png"), device = "png", width = 8, height = 5)

R Markdown ggplot2 From line 374 of analyses/map_enhancers.Rmd

# only select significant enhancer - gene pairs
enh_gene_pairs <- filter(cis_enh_perts, significant == "sig")

# plot the distance to TSS vs number of gRNA hits
ggplot(drop_na(enh_gene_pairs),
       aes(x = as.factor(grna_hits), y = abs(dist_to_tss) / 1000,color = sample)) +
  geom_hline(yintercept = 1000, lty = "dashed", color = "gray42") +
  geom_jitter(width = 0.2) +
  geom_boxplot(color = "black", width = 0.2, fill = NA, outlier.shape = NA) +
  labs(x = "Confidence level (gRNA hits)", y = "Distance to TSS (kb)",
       title = "Distance to TSS vs confidence level", color = "Sample") +
  scale_color_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_y_log10() +
  theme_bw() +
  theme(text = element_text(size = 20), panel.grid = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "confidence_vs_dist.pdf"), device = "pdf", width = 7, height = 5)

R Markdown ggplot2 From line 428 of analyses/map_enhancers.Rmd

# add label for confidence level based on prop gRNA hits
cis_enh_perts_plot <- cis_enh_perts %>%
  filter(!is.na(prop_grna_hits)) %>%
  mutate(conf = if_else(significant == "sig" & prop_grna_hits >= 0.5,
                        true = "High confidence", false = "Non significant"),
         conf = if_else(significant == "sig" & prop_grna_hits < 0.5,
                        true = "Low confidence", false = conf))

# plot distance to TSS of significant enhacner gene - pairs vs non-significant pairs
ggplot(cis_enh_perts_plot,
       aes(x = fct_relevel(conf, "High confidence", "Low confidence"),
           y = abs(dist_to_tss) / 1000, fill = conf)) +
  geom_hline(yintercept = 1000, lty = "dashed") +
  geom_violin() +
  labs(y = "Distance to TSS (kb)") +
  scale_fill_manual(values = c("High confidence" = "goldenrod2", "Low confidence" = "darkslategray3", 
                               "Non significant" = "gray")) +
  scale_y_log10() +
  theme_bw() +
  theme(legend.position = "none", axis.title.x = element_blank(), text = element_text(size = 20),
        panel.grid = element_blank())

R Markdown ggplot2 From line 454 of analyses/map_enhancers.Rmd

# import gencode hg38 annotations
annot_url <- "ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"
annot <- import(annot_url, format = "gtf")

# extract exon annotations for protein-coding genes
genes <- annot[annot$type == "exon" & annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding"]

# only retain annotations on assessed chromosomes (maybe a little faster to overlap)
enh_chrs <- unique(cis_enh_perts$enh_chr)
genes <- genes[seqnames(genes) %in% enh_chrs]
genes <- split(genes, f = genes$gene_name)

R Markdown From line 485 of analyses/map_enhancers.Rmd

# calculate enhancer centers
enh_gene_pairs <- enh_gene_pairs %>%
  mutate(enh_center = round((enh_start + enh_end) / 2))

# create (sorted) interval between enhancer center and target gene tss
coords <- enh_gene_pairs %>%
  select(sample, perturbation, gene, gene_tss, enh_center) %>%
  gather(key = "type", value = "coord", -c(sample, perturbation, gene)) %>%
  group_by(sample, perturbation, gene) %>%
  summarize(start = sort(coord)[1],
            end = sort(coord)[2])

# merge with additional data
coords <- enh_gene_pairs %>%
  select(sample, perturbation, gene, chr = enh_chr, dist_to_tss, neg_log10_pvalue,
         prop_grna_hits, grna_hits) %>%
  right_join(coords, by = c("sample", "perturbation", "gene"))

# create GRanges object and overlap with protein-coding gene annotations
overlaps <- coords %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE) %>%
  findOverlaps(query = ., subject = genes)

# convert to list per hit
overlaps_list <- split(overlaps, f = from(overlaps))

# get unique overlapping gene names (genes that are skipped by the enhancer)
skipped_genes <- lapply(overlaps_list, FUN = function(x) {
  target_gene <- unique(coords[from(x), "gene"])
  all_genes <- names(genes[to(x)])
  all_genes[all_genes != target_gene]
})

# count number of skipped genes per association and add that number to data.frame
n_skipped <- sapply(skipped_genes, FUN = length)
coords$skipped_genes <- n_skipped

# compute histogram
skipped_hist <- coords %>%
  count(skipped_genes) %>%
  mutate(perc = n / sum(n))

# plot the number of skipped genes
total_value = sum(skipped_hist$n)
ggplot(skipped_hist, aes(x = skipped_genes, y = n)) +
  geom_bar(stat = "identity") +
  labs(x = "Genes skipped") +
  scale_y_continuous(sec.axis = sec_axis(~(./total_value), labels = scales::percent,
                                         name = "Percentage"), name = "Counts") +
  theme_bw() +
  theme(text = element_text(size = 15), panel.grid = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "skipped_genes.pdf"), device = "pdf", width = 7, height = 5)

# plot association strength vs skipped genes
ggplot(coords, aes(x = skipped_genes, y = neg_log10_pvalue, color = prop_grna_hits)) +
  geom_point() +
  scale_color_gradient(low = "blue", high = "orangered") +
  labs(x = "Genes skipped", y = expression(-log[10] ~ p - value),
       title = "Association strength vs skipped genes", color = "Confidence\n(prop. gRNA hits)") +
  theme_bw() +
  theme(text = element_text(size = 15), panel.grid = element_blank())

# save plot for manuscript
ggsave(here("results/plots", "pvalue_vs_skipped_genes.pdf"), device = "pdf", width = 7, height = 5)

R Markdown ggplot2 From line 500 of analyses/map_enhancers.Rmd

# create GRanges object with enhancer center coordinates for every detected association
enh_coords <- enh_gene_pairs %>%
  select(sample, perturbation, gene, enh_chr, enh_center, dist_to_tss, neg_log10_pvalue,
         grna_hits, prop_grna_hits) %>%
  makeGRangesFromDataFrame(seqnames.field = "enh_chr", start.field = "enh_center",
                           end.field = "enh_center", keep.extra.columns = TRUE)

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    tss <- min(start(x))
  }else if (all(strand(x) == "-")) {
    tss <- max(end(x))
  }else{
    warning("Inconsistent strand information for gene:", unique(x$gene_name), call. = FALSE)
    return(GRanges())
  }
  GRanges(seqnames = unique(seqnames(x)), ranges = IRanges(tss, tss), strand = unique(strand(x)))
}

# get TSS for all genes on chr8 and chr11
gene_tss <- unlist(GRangesList(lapply(genes, FUN = get_tss)))

# find nearest gene TSS to each enhancer
nearest_idx <- nearest(x = enh_coords, subject = gene_tss, ignore.strand = TRUE)
nearest_genes <- names(gene_tss[nearest_idx])

# add to associations
nearest_per_hit <- data.frame(mcols(enh_coords), nearest_gene = nearest_genes,
                              stringsAsFactors = FALSE)

# add variable to specify if closest gene ot the enhancer is the inferred target
nearest_per_hit <- nearest_per_hit %>%
  mutate(is_nearest = if_else(gene == nearest_gene, true = 1, false = 0))

# number and percentage of enhancers where the target gene is the nearest gene
n_nearest <- sum(nearest_per_hit$is_nearest)
perc_is_nearest <- mean(nearest_per_hit$is_nearest) * 100

# labels for x-axis for plot(s)
x_labels <- c(paste0("Nearest TSS (n = ", n_nearest,")"),
              paste0("Other TSS (n = ", nrow(nearest_per_hit) - n_nearest, ")"))

# title for plot
plot_title = paste0("Nearest TSS (", round(perc_is_nearest, digits = 1), "% of cases)")

# plot the number of pairs, where the target TSS is the closest TSS vs. not 
ggplot(nearest_per_hit, aes(x = fct_relevel(factor(is_nearest), "1"), fill = sample)) +
  geom_bar(position = "dodge") +
  labs(title = plot_title, y = "ETPs", fill = "Region") +
  scale_fill_manual(values = c("Chr11" = "indianred3", "Chr8" = "steelblue")) +
  scale_x_discrete(labels = x_labels) +
  theme_bw() +
  theme(axis.title.x = element_blank(), text = element_text(size = 13),
        panel.grid = element_blank())

R Markdown ggplot2 From line 572 of analyses/map_enhancers.Rmd

# compare p-value and confidence level between hits where the target gene is the nearest gene and
# hits where this is not the case
nearest_per_hit %>%
  select(is_nearest, neg_log10_pvalue, prop_grna_hits) %>%
  gather(key = "stat", value = "value", -is_nearest) %>%
  ggplot(., aes(x = factor(is_nearest), y = value)) +
  facet_wrap(~stat, ncol = 1, scales = "free") +
  geom_boxplot(fill = "steelblue3") +
  scale_x_discrete(labels = x_labels) +
  theme_bw() +
  theme(axis.title.x = element_blank(), panel.grid = element_blank())

R Markdown ggplot2 From line 632 of analyses/map_enhancers.Rmd

library(here)
library(rtracklayer)
library(tidyverse)

# prepare data -------------------------------------------------------------------------------------

# files containing differential expression results
result_files <- here(snakemake@input$results)

# set name for each file
samples <- basename(dirname(dirname(result_files)))
names(result_files) <- basename(result_files) %>%
  sub("output_MAST_", "", .) %>%
  sub("_.*", "", .) %>%
  paste(samples, ., sep = "_")

# read files and convert into one data.frame
results <- result_files %>%
  lapply(FUN = read.csv, stringsAsFactors = FALSE) %>%
  bind_rows(.id = "id") %>%
  separate(id, into = c("sample", "strategy"), sep = "_")

# files containing manually calculated log fold change
lfc_files <- here(snakemake@input$lfc)
names(lfc_files) <- basename(dirname(dirname(lfc_files)))

# load lfc files and convert into one data.frame
lfc <- lfc_files %>%
  lapply(FUN = read.csv, stringsAsFactors = FALSE) %>%
  bind_rows(.id = "sample")

# replace infinite values (pert or control is 0) by NA
lfc <- mutate(lfc, lfc = replace(lfc, is.infinite(lfc), NA))

# load target gene annotations
annot <- lapply(here(snakemake@input$annot), FUN = import, format = "gtf")

# merge into one GRanges object
annot <- annot %>%
  do.call("c", .) %>%
  unique()

# filter for exons of protein-coding genes (no processed transcripts etc)
annot <- annot[annot$type == "exon" &
                 annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding"
               ]

# remove unlikely exons for HBE1 (extremely long introns, do not agree with RefSeq nor other HB*)
annot <- annot[!annot$exon_id %in% c("ENSE00001484269", "ENSE00001484268", "ENSE00001526635")]

# split by gene name into a GRangesList
genes <- split(annot, f = annot$gene_name)

# load enhancer coordinates files
enh_coords <- here(snakemake@input$enh_coords) %>%
  lapply( FUN = read.table, header = FALSE, stringsAsFactors = FALSE) %>%
  bind_rows() %>%
  distinct()

colnames(enh_coords) <- c("chr", "start", "end", "name", "score", "strand")


# FDR and confidence levels ------------------------------------------------------------------------

# correct for multiple testing across samples using FDR
results <- results %>%
  group_by(strategy) %>%
  rename(pval_adj_sample = pval_adj_allTests) %>%
  mutate(pval_adj_allTests = p.adjust(pvalue, method = "fdr"))

# extract results for 'per enhancer' tests
enh_perts <- filter(results, strategy == "perEnh")

# function to collapse gRNA ids per targeted enhancer
collapse_grnas <- function(grnas) {
  grnas <- sub("_.+", "", grnas)
  sub("\\.[A|B|C|D]", "", grnas)
}

# count number of gRNA hits per enhancer-gene pair
gRNA_hits <- results %>%
  filter(strategy == "perGRNA") %>%
  mutate(perturbation = collapse_grnas(perturbation)) %>%
  group_by(sample, perturbation, gene) %>%
  summarize(grna_hits = sum(pval_adj_allTests < snakemake@params$confidence_fdr),
            prop_grna_hits = mean(pval_adj_allTests < snakemake@params$confidence_fdr))

# add single gRNA hits to enhancer hits
enh_perts <- left_join(enh_perts, gRNA_hits, by = c("sample", "perturbation", "gene"))

# log fold change ----------------------------------------------------------------------------------

# add manually calculate log fold change to de results
enh_perts <- lfc %>%
  select(sample, perturbation, gene, manual_lfc = lfc) %>%
  left_join(enh_perts, ., by = c("sample", "perturbation", "gene"))

# add enhancer and gene TSS coordinates ------------------------------------------------------------

# compute center of every enhancer
enh_centers <- enh_coords %>%
  mutate(enh_center = round((start + end) / 2)) %>%
  select(name, chr, start, end, enh_center) %>%
  mutate(name = gsub(":|-", ".", name),
         chr = sub("chr", "", chr)) %>%
  rename(enh_chr = chr, enh_start = start, enh_end = end)

# add enhancer centers to de results
enh_perts <- enh_perts %>%
  left_join(enh_centers, by = c("perturbation" = "name"))

# get chromosome and strand for each gene
gene_chr <- unlist(unique(seqnames(genes)))
gene_strand <- unlist(unique(strand(genes)))

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    min(start(x))
  }else if (all(strand(x) == "-")) {
    max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
}

# get TSS for each gene
gene_tss <- sapply(genes, FUN = get_tss)

# create data.frame with strand and tss coordinates for every gene
tss_coords <- data.frame(gene_chr, gene_tss, gene_strand, check.rows = TRUE) %>%
  rownames_to_column(var = "gene") %>%
  mutate_if(is.factor, as.character)

# add to discovery perturbation results
enh_perts <- left_join(enh_perts, tss_coords, by = "gene")


# add distance to TSS for cis associations ---------------------------------------------------------

# add type for cis or trans enhancer interactions
enh_perts <- enh_perts %>%
  mutate(enh_type = if_else(enh_chr == gene_chr, true = "cis", false = "trans"))

# calculate distance to tss for every cis enhancer - gene pair
enh_perts <- enh_perts %>%
  mutate(dist_to_tss = if_else(enh_type == "cis",
                               true = if_else(gene_strand == "+",
                                              true  = enh_center - gene_tss,
                                              false = gene_tss - enh_center),
                               false = as.numeric(NA)))


# label intronic ETPs ------------------------------------------------------------------------------

# create GRangesList object with gene locus coordinates (overlapping enhancers should be intronic)
gene_loci <- range(genes)

# create GRanges object with coordinates of all cis-enhancers (no promoter controls)
cis_enh_coords <- enh_perts %>% 
  ungroup() %>% 
  filter(enh_type == "cis") %>% 
  select(chr = enh_chr, start = enh_start, end = enh_end, perturbation) %>%
  distinct() %>% 
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# overlap enhancers with gene loci
overlaps <- findOverlaps(cis_enh_coords, gene_loci, ignore.strand = TRUE)

# covert to data.frame with overlapping gene names for every enhancers
enh_gene_overlaps <- data.frame(perturbation = cis_enh_coords[queryHits(overlaps)]$perturbation,
                                gene = names(gene_loci[subjectHits(overlaps)]),
                                stringsAsFactors = FALSE)

# create nested data.frame with all overlapping genes per perturbation
enh_gene_overlaps <- enh_gene_overlaps %>% 
  group_by(perturbation) %>% 
  nest(enh_overlapping_genes = gene)

# add to results and add column indicating intronic ETPs
enh_perts <- enh_perts %>% 
  left_join(enh_gene_overlaps, by = "perturbation") %>% 
  rowwise() %>% 
  mutate(enh_in_intron = if_else(gene %in% enh_overlapping_genes$gene, true = 1, false = 0))


# export processed DE results ----------------------------------------------------------------------

# reformat for output
enh_perts <- select(enh_perts, -c(enh_center, enh_overlapping_genes))

# save processed output to file
write.csv(enh_perts, file = here(snakemake@output), row.names = FALSE)

R tidyverse here From line 5 of analyses/process_de_results.R

library(tidyverse)
library(here)

# get tap-seq target genes -------------------------------------------------------------------------

# load tap-seq target genes
target_genes <- read.csv(here(snakemake@input$target_genes), stringsAsFactors = FALSE)

# known enhancer target genes
known_enh <- target_genes %>%
  filter(screen == "validation") %>%
  pull(gene)

# chromosome 11 inhibition panel
chr11_genes <- target_genes %>%
  filter(screen == "inhibition", panel == "chr11_hs2") %>%
  pull(gene) %>%
  c(known_enh)

# chromosome 8 inhibition panel
chr8_genes <- target_genes %>%
  filter(screen == "inhibition") %>%
  filter(panel %in% c("chr8_zfpm2", "chr8_myc")) %>%
  pull(gene) %>%
  c(known_enh)

# string patterns for all target genes (incl CROP-seq vectors)
vector_pattern <- snakemake@params$vector_pattern
targets_chr11 <- c(chr11_genes, vector_pattern)
targets_chr8 <- c(chr8_genes, vector_pattern)

# total number of reads ----------------------------------------------------------------------------

# fastq files containing genome reads of each sample
fastq_files <- snakemake@input$fastq

# infer sample names from fastq files and set as names
bn <- sub(".*lane1", "", basename(fastq_files))
names(fastq_files) <- sub("_2_sequence.txt.gz", "", bn)

# count total reads in fastq files
raw_reads <- sapply(X = fastq_files, FUN = function(x) {
  command <- paste("echo $(zcat", x, "| wc -l) / 4 | bc")
  as.numeric(system(command, intern = TRUE))
})

# convert to data.frame
raw_reads <- data.frame(sample = names(raw_reads), total_reads = raw_reads,
                        stringsAsFactors = FALSE)

# reads on target ----------------------------------------------------------------------------------

# function to count reads on target genes
reads_on_target <- function(bam_files, targets, bam_tag = "GE:Z:") {

  # add bam tag to target genes to create patterns for grep and write to text file
  targets_patterns <- paste0(bam_tag, targets)
  targets_file <- tempfile(pattern = "target_patterns_", fileext = ".txt")
  write(targets_patterns, file = targets_file)

  # count reads on targets
  target_reads <- vapply(bam_files, FUN = function(x) {
    command <- paste("samtools view", x, "| fgrep -f", targets_file, "| wc -l")
    as.numeric(system(command, intern = TRUE))
  }, FUN.VALUE = numeric(1))

  # delete target genes tempfile
  unlink(targets_file)

  # create final output
  data.frame(sample = names(bam_files), target_reads = target_reads, stringsAsFactors = FALSE)

}

# get bam files
bam_files <- here(snakemake@input$bam)
names(bam_files) <- basename(dirname(bam_files))

# bam files for chr11 and chr8 targets
chr11_bam <- c(bam_files[grep(names(bam_files), pattern = "^11")], bam_files["Sample10X"])
chr8_bam <- c(bam_files[grep(names(bam_files), pattern = "^8")], bam_files["Sample10X"])

# count reads mapping to target genes
chr11_target_reads <- reads_on_target(chr11_bam, targets = targets_chr11)
chr8_target_reads <- reads_on_target(chr8_bam, targets = targets_chr8)

# finalize output and save to file -----------------------------------------------------------------

# combine read counts into data frame and compute percentage of reads on target genes
read_counts <- bind_rows(chr11 = chr11_target_reads, chr8 = chr8_target_reads, .id = "panel") %>%
  left_join(raw_reads, by = "sample") %>%
  mutate(perc = target_reads / total_reads)

# save to output file
write.csv(read_counts, file = here(snakemake@output), row.names = FALSE)

R SAMtools tidyverse here From line 5 of analyses/reads_on_target.R

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 16 of analyses/screen_data_qc.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# attach required packages
library(factoextra)
library(grid)
library(gridExtra)
library(ggrepel)
library(GGally)
library(here)
library(tidyverse)

R Markdown tidyverse ggrepel gridExtra here GGally factoextra From line 23 of analyses/screen_data_qc.Rmd

# load target genes
target_genes <- read.csv(here(snakemake@input$target_genes), stringsAsFactors = FALSE)

# load dge summary
dge_stats_files <- here(snakemake@input$dge_stats)
names(dge_stats_files) <- basename(dirname(dge_stats_files))
dge_stats <- lapply(dge_stats_files, FUN = read.table, header = TRUE, stringsAsFactors = FALSE)

# load dge data
dge_files <- here(snakemake@input$dge)
names(dge_files) <- basename(dirname(dge_files))
dge <- lapply(dge_files, FUN = read.table, header = TRUE, stringsAsFactors = FALSE,
              row.names = "GENE")

# load perturbation status
perturb_status_files <- here(snakemake@input$perturb_status)
names(perturb_status_files) <- basename(dirname(perturb_status_files))
perturb_status <- lapply(perturb_status_files, FUN = read.table, header = TRUE,
                         row.names = "VECTOR", stringsAsFactors = FALSE)

# dge input files from validation TAP-seq experiments
valid_dge_files <- grep(snakemake@input$valid_dge, pattern = "iv2", value = TRUE)
valid_dge_files <- here(valid_dge_files)
names(valid_dge_files) <- basename(dirname(dirname(valid_dge_files)))

# load validation dge data
valid_dge <- lapply(valid_dge_files, FUN = read.table, header = TRUE, stringsAsFactors = FALSE)

# load experimental meta data and add sample information
exp_data <- here(snakemake@input$exp_data) %>%
  read.csv(stringsAsFactors = FALSE) %>%
  mutate(sample = paste0(sub("chr", "", panel), "Screen1"))

# transfected vector files
vector_files <- here(snakemake@input$vctr_seqs)
names(vector_files) <- sub("cropseq_vectors_(chr.+)_screen.fasta", "\\1", basename(vector_files))

# load transfected vectors and extract sample ids for both chromosomal regions
vectors <- vector_files %>%
  lapply(FUN = readLines) %>%
  lapply(FUN = grep, pattern = "^>.+$", perl = TRUE, value = TRUE) %>%
  lapply(FUN = function(x) data.frame(vector = x, stringsAsFactors = FALSE) ) %>%
  bind_rows(.id = "sample") %>%
  mutate(vector = sub(vector, pattern = ">", replacement = ""),
         sample = paste0(sub("chr", "", sample), "iScreen1"))

# extract vector expression from dge data
vector_pattern <- snakemake@params$vector_pattern
vector_rows <- lapply(dge, FUN = function(x) grep(rownames(x), pattern = vector_pattern) )
vctr_expr <- mapply(FUN = function(x, y) x[y, ], x = dge, y = vector_rows)

# extract gene expression data
gene_expr <- mapply(FUN = function(x, y) x[-y, ], x = dge, y = vector_rows)

# remove dge to free up memory
rm(dge)

R Markdown From line 37 of analyses/screen_data_qc.Rmd

# combine dge_stats into one data.frame and split cell barcode into i7 and cell
dge_stats <- dge_stats %>%
  bind_rows(.id = "sample") %>%
  separate(col = cell_barcode, into = c("i7_index", "cell"), sep = 8)

# add variable for 10x lane
dge_stats <- left_join(dge_stats, select(exp_data, sample, name, i7_index),
                       by = c("sample", "i7_index"))

# plot number of cells per lane
p1 <- ggplot(dge_stats, aes(x = name, fill = name)) +
  facet_wrap(~sample, scales = "free_x") +
  geom_bar() +
  theme_bw() +
  labs(x = "10x lane", y = "Cells", title = "Number of cells per 10x lane") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# plot number of detected genes per cell
p2 <- ggplot(dge_stats, aes(x = name, y = genes, color = name)) +
  facet_wrap(~sample, scales = "free_x") +
  geom_jitter(width = 0.1, alpha = 0.2) +
  geom_boxplot(color = "black", fill = NA, outlier.shape = NA, notch = TRUE) +
  labs(x = "10x lane", y = "Detected genes", title = "Detected genes per cell") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# print plots
grid.arrange(p1, p2, nrow = 2, ncol = 1)

R Markdown ggplot2 From line 105 of analyses/screen_data_qc.Rmd

# plot number of transcripts vs genic reads
p1 <- ggplot(dge_stats, aes(x = transcripts, y = genic_reads, color = name)) +
  facet_wrap(~sample) +
  geom_point() +
  labs(x = "Transcripts", y = "Genic reads", title = "Transcripts vs. genic reads",
       color = "10x lane") +
  scale_x_log10() +
  scale_y_log10() +
  theme_bw() +
  theme(legend.position = "none")

# plot number of genic reads per cell
p2 <- ggplot(dge_stats, aes(x = name, y = genic_reads, color = name)) +
  facet_wrap(~sample, scales = "free_x", ncol = 2) +
  geom_jitter(width = 0.1, alpha = 0.2) +
  geom_boxplot(color = "black", fill = NA, outlier.shape = NA, notch = TRUE) +
  labs(x = "10x lane", y = "Genic reads", title = "Genic reads per cell") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# plot number of transcripts per cell
p3 <- ggplot(dge_stats, aes(x = name, y = transcripts, color = name)) +
  facet_wrap(~sample, scales = "free_x", ncol = 2) +
  geom_jitter(width = 0.1, alpha = 0.2) +
  geom_boxplot(color = "black", fill = NA, outlier.shape = NA, notch = TRUE) +
  labs(x = "10x lane", y = "Transcripts", title = "Transcripts per cell") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")

# print plots
layout <- rbind(c(1, 1), c(2, 2), c(3, 3))
grid.arrange(p1, p2, p3, layout_matrix = layout)

R Markdown ggplot2 From line 141 of analyses/screen_data_qc.Rmd

# transpose gene expression, convert to long format and combine into one data.frame
gene_expr_l <- gene_expr %>%
  lapply(FUN = function(x) as.data.frame(t(x))) %>%
  lapply(FUN = rownames_to_column, var = "cell_barcode") %>%
  lapply(FUN = gather, key = "gene", value = "txs", -cell_barcode) %>%
  bind_rows(.id = "sample")

# split barcode into i7 index and cell barcode, and add variable for 10x lane
gene_expr_l <- gene_expr_l %>%
  separate(col = cell_barcode, into = c("i7_index", "cell"), sep = 8) %>%
  left_join(select(exp_data, sample, name, i7_index), by = c("sample", "i7_index"))

# compute average gene expression per 10x lane
avg_genex_lanes <- gene_expr_l %>%
  group_by(sample, name, gene) %>%
  summarize(avg_txs = mean(txs))

# plot average gene expression
ggplot(avg_genex_lanes, aes(x = name, y = avg_txs, color = name)) +
  facet_wrap(~sample, scales = "free_x", ncol = 2) +
  geom_jitter(width = 0.25, alpha = 0.5) +
  geom_boxplot(color = "black", fill = NA, notch = TRUE, outlier.shape = NA) +
  labs(x = "10x lane", y = "Average transcripts per gene", title = "Transcripts per gene") +
  scale_y_log10() +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "none")

R Markdown ggplot2 From line 182 of analyses/screen_data_qc.Rmd

# calculate log10 average gene expression and reformat
avg_genex_log10 <- mutate(avg_genex_lanes, avg_txs = log10(avg_txs))

# split by sample and reformat
avg_genex_log10 <- avg_genex_log10 %>%
  split(f = .$sample) %>%
  lapply(spread, key = "name", value = "avg_txs")

# plot pairwise correlations between 10x lanes
plots <- lapply(avg_genex_log10, FUN = function(x) ggpairs(x, columns = 3:ncol(x),
           title = unique(x$sample), progress = FALSE) )

# print plots
for (plot in plots) print(plot)

R Markdown From line 215 of analyses/screen_data_qc.Rmd

# convert gene expression into data matrices and transpose for pca
genex_mat <- lapply(gene_expr, FUN = function(x) t(data.matrix(x)) )

# create experimental meta data for every cell
exp_data_cells <- genex_mat %>%
  lapply(FUN = function(x) data.frame(cell = rownames(x), stringsAsFactors = FALSE) ) %>%
  bind_rows(.id = "sample") %>%
  mutate(i7_index = substr(cell, start = 1, stop = 8)) %>%
  left_join(select(exp_data, sample, i7_index, chip_10x, lane_10x, bead_vial_10x, name),
            by = c("sample", "i7_index")) %>%
  rename(lane_number_10x = lane_10x, sample_id_10x = name) %>%
  split(f = .$sample)

# perform PCA on DGE data
pca_genex <- lapply(genex_mat, FUN = prcomp, scale. = TRUE)

# function to plot pca colored by experimental variable 
plot_pca <- function(var, pca_obj, title = NULL) {
  fviz_pca_ind(pca_obj, habillage = var, label = "none", invisible = "quali", title = title)
}

# plot pcs 1 & 2 colored by experimental meta data
pca_plots_chr11 <- lapply(colnames(exp_data_cells[["11iScreen1"]][, -c(1:3)]), FUN = function(x) {
  plot_pca(var = exp_data_cells[["11iScreen1"]][, x], pca_obj = pca_genex[["11iScreen1"]],
           title = x)
})

pca_plots_chr8 <- lapply(colnames(exp_data_cells[["8iScreen1"]][, -c(1:3)]), FUN = function(x) {
  plot_pca(var = exp_data_cells[["8iScreen1"]][, x], pca_obj = pca_genex[["8iScreen1"]], title = x)
})

# print plots
grid.arrange(grobs = pca_plots_chr11, top = textGrob("11iScreen1", gp = gpar(fontsize = 15)))
grid.arrange(grobs = pca_plots_chr8,  top = textGrob("8iScreen1", gp = gpar(fontsize = 15)))

R Markdown From line 238 of analyses/screen_data_qc.Rmd

# number of transcripts per vector
txs_per_vector <- vctr_expr %>%
  lapply(FUN = rowSums) %>%
  lapply(FUN = function(x) data.frame(vector = names(x), txs = x, stringsAsFactors = FALSE)) %>%
  bind_rows(.id = "sample") %>%
  mutate(vector = sub(vector_pattern, "", vector))

# compute the number of cells per perturbation
cells_per_vector <- perturb_status %>%
  lapply(FUN = rowSums) %>%
  lapply(FUN = function(x) data.frame(vector = names(x), cells = x, stringsAsFactors = FALSE)) %>%
  bind_rows(.id = "sample")

# merge transcripts and cells per vector and add vectors that were not detected
vector_stats <- full_join(txs_per_vector, cells_per_vector, by = c("sample", "vector")) %>%
  full_join(vectors, by = c("sample", "vector")) %>%
  replace_na(replace = list(txs = 0, cells = 0))

# calculate average vector txs per transfected cell (if possible)
vector_stats <- vector_stats %>%
  mutate(avg_txs = if_else(cells > 0, true = txs / cells, false = 0))

# add label for control vectors and convert to long format
vector_stats <- vector_stats %>%
  mutate(type = if_else(grepl(vector, pattern = "^chr.+$"), true = "screen", false = "ctrl"))

R Markdown From line 280 of analyses/screen_data_qc.Rmd

# reformat vector_stats for plotting
vector_stats_plot <- vector_stats %>%
  gather(key = "stat", value = "value", -c(sample, vector, type)) %>%
  mutate(vector = paste(sample, vector, stat, sep = "_")) %>%
  mutate(vector = fct_reorder(vector, value, .desc = TRUE)) %>%
  mutate(stat = case_when(stat == "txs" ~ "Transcripts per vector",
                          stat == "cells" ~ "Cells per vector",
                          stat == "avg_txs" ~ "Average transcripts per cell"))

# plot vector transcripts and cells for chromosome 11
p1 <- vector_stats_plot %>%
  filter(sample == "11iScreen1") %>%
  ggplot(., aes(x = value)) +
  facet_wrap(~stat, scales = "free", ncol = 1) +
  geom_histogram(bins = 50) +
  labs(x = "Transcripts / Cells", title = "11iScreen1") +
  theme_bw()

p2 <- vector_stats_plot %>%
  filter(sample == "11iScreen1") %>%
  ggplot(., aes(x = vector, y = value, color = type)) +
  facet_wrap(~stat, scales = "free", ncol = 1) +
  geom_point() +
  geom_rug(color = "black", alpha = 0.1, sides = "l") +
  scale_color_manual(values = c("ctrl" = "firebrick2", "screen" = "steelblue3")) +
  labs(x = "Vector", y = "Transcripts / Cells", title = "", color = "Vector type") +
  scale_x_discrete(expand = c(0.06, 0)) +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid = element_blank())

layout <- rbind(c(1, 1, 2, 2, 2))
grid.arrange(p1, p2, ncol = 2, layout_matrix = layout)

R Markdown ggplot2 From line 308 of analyses/screen_data_qc.Rmd

# plot vector transcripts and cells for chromosome 8
p1 <- vector_stats_plot %>%
  filter(sample == "8iScreen1") %>%
  ggplot(., aes(x = value)) +
  facet_wrap(~stat, scales = "free", ncol = 1) +
  geom_histogram(bins = 50) +
  labs(x = "Transcripts / Cells", title = "8iScreen1") +
  theme_bw()

p2 <- vector_stats_plot %>%
  filter(sample == "8iScreen1") %>%
  ggplot(., aes(x = vector, y = value, color = type)) +
  facet_wrap(~stat, scales = "free", ncol = 1) +
  geom_point() +
  geom_rug(color = "black", alpha = 0.1, sides = "l") +
  scale_color_manual(values = c("ctrl" = "firebrick2", "screen" = "steelblue3")) +
  labs(x = "Vector", y = "Transcripts / Cells", title = "", color = "Vector type") +
  scale_x_discrete(expand = c(0.06, 0)) +
  theme_bw() +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), panel.grid = element_blank())

layout <- rbind(c(1, 1, 2, 2, 2))
grid.arrange(p1, p2, ncol = 2, layout_matrix = layout)

R Markdown ggplot2 From line 343 of analyses/screen_data_qc.Rmd

# compute average gene expression per gene
avg_genex <- gene_expr_l %>%
  group_by(sample, gene) %>%
  summarize(avg_txs = mean(txs))

# get target genes for both samples
sample_targets_chr11 <- target_genes %>%
  filter((panel == "chr11_hs2" | screen == "validation"), !gene %in% c("HBG1", "HBG2")) %>%
  mutate(sample = "11iScreen1")

sample_targets_chr8 <- target_genes %>%
  filter((panel %in% c("chr8_myc", "chr8_zfpm2") | screen == "validation"),
         !gene %in% c("HBG1", "HBG2")) %>%
  mutate(sample = "8iScreen1")

sample_targets <- bind_rows(sample_targets_chr11, sample_targets_chr8) %>%
  select(gene, sample, type = screen) %>%
  mutate(type = if_else(type == "validation", true = "e-gene", false = "non_e-gene"))

# average gene expression in validation experiments
avg_genex_valid <- valid_dge %>%
  lapply(FUN = gather, key = "cell", value = "txs", -GENE) %>%
  bind_rows(.id = "sample") %>%
  mutate(sample = paste0(sub("iv2.*", "", sample), "iScreen1")) %>%
  rename(gene = GENE) %>%
  right_join(sample_targets, by = c("sample", "gene")) %>%
  group_by(sample, gene, type) %>%
  summarize(avg_txs_valid = mean(txs))

# merge with screen avg genex datasets
avg_genex_merged <- full_join(avg_genex, avg_genex_valid, by = c("sample", "gene"))

# correlation plot
ggplot(avg_genex_merged, aes(x = avg_txs_valid, y = avg_txs, color = type)) +
  facet_wrap(~sample) +
  geom_abline() +
  geom_point() +
  geom_text_repel(data = filter(avg_genex_merged, type == "e-gene" | gene %in% c("STK3", "PGAP2")),
                  aes(label = gene), box.padding = unit(3, "lines"), color = "gray50") +
  labs(title = "Average gene expression", x = "Transcripts validation", y = "Transcripts screen",
       color = "Type") +
  scale_color_manual(values = c("e-gene" = "firebrick2", "non_e-gene" = "steelblue3")) +
  scale_x_log10() +
  scale_y_log10() +
  coord_fixed() +
  theme_bw()

R Markdown ggplot2 From line 377 of analyses/screen_data_qc.Rmd

# only retain perturbation status on control perturbations
ctrl_perts <- perturb_status %>%
  lapply(FUN = rownames_to_column, var = "vector") %>%
  lapply(FUN = function(x) x[!grepl(x$vector, pattern = "^chr.+$"), ] ) %>%
  lapply(FUN = function(x) x[, c(TRUE, colSums(x[, -1]) > 0)] ) %>%  # remove cells without any pert
  lapply(FUN = gather, key = "cell_barcode", value = "pert", -vector) %>%
  bind_rows(.id = "sample") %>%
  filter(pert > 0)  # only retain detected perturbation events

# convert vector ids to grna targets
ctrl_perts <- ctrl_perts %>%
  mutate(vector = sub("_.+", "", vector),
         vector = sub("-[ABCD]", "", vector)) %>%
  rename(target = vector)

# collapse vector perturbations
ctrl_perts <- ctrl_perts %>%
  group_by(sample, target, cell_barcode) %>%
  summarize(pert = sum(pert))

R Markdown From line 430 of analyses/screen_data_qc.Rmd

# get gene expression data for cells carrying control perturbations
genex_pert <- gene_expr_l %>%
  unite(col = "cell_barcode", i7_index, cell, sep = "") %>%
  right_join(ctrl_perts, by = c("sample", "cell_barcode"))

# calculate average gene expression of each gene in every perturbation
avg_genex_pert <- genex_pert %>%
  group_by(sample, target, gene) %>%
  summarize(avg_txs = mean(txs)) %>%
  ungroup()

# extract negative control expression for every gene
neg_ctrl <- avg_genex_pert %>%
  filter(target == "non-targeting") %>%
  select(sample, gene, neg_ctrl = avg_txs)

# add to perturbed average expression levels
avg_genex_pert <- avg_genex_pert %>%
  filter(target != "non-targeting") %>%
  left_join(neg_ctrl, by = c("sample", "gene"))

# calculate log fold change for each gene in each perturbation
lfc_pert <- avg_genex_pert %>%
  filter(avg_txs > 0) %>%  # filter for genes with detected expression for the given perturbations
  mutate(fc = avg_txs / neg_ctrl,
         lfc = log2(fc))

R Markdown From line 452 of analyses/screen_data_qc.Rmd

# list of perturbations and their target genes
perturbations <- as.list(unique(lfc_pert$target))
names(perturbations) <- unlist(perturbations)
perturbations$HS2 <- c("HBB", "HBD", "HBE1", "HBG1", "HBG2")

# add variable indicating if a gene is a target of the given perturbation
lfc_pert <- lfc_pert %>%
  group_by(target) %>%
  mutate(type = if_else(gene %in% unlist(perturbations[target]),
                        true = "target", false = "non_target"))

# function to plot perturbation effects for one sample
plot_pert_effects <- function(lfc_pert, title = NULL) {
  ggplot(lfc_pert, aes(x = lfc, y = gene, color = type)) +
    facet_wrap(~target, ncol = 3) +
    geom_vline(xintercept = c(-1, 1), lty = "dashed", color = "darkgray") +
    geom_vline(xintercept = 0, color = "darkgray") +
    geom_point() +
    geom_text_repel(data = filter(lfc_pert, abs(lfc) > 0.75), aes(label = gene),
                    box.padding = unit(1.5, "lines"), color = "gray50") +
    labs(title = title, x = "Expression log fold change") +
    scale_color_manual(values = c("target" = "firebrick2", "non_target" = "steelblue3")) +
    theme_bw() +
    theme(axis.text.y = element_text(size = 5))
}

plot_pert_effects(filter(lfc_pert, sample == "11iScreen1"), title = "Perturbation LFC 11iScreen1")
plot_pert_effects(filter(lfc_pert, sample == "8iScreen1"), title = "Perturbation LFC 8iScreen1")

R Markdown ggplot2 From line 481 of analyses/screen_data_qc.Rmd

# get average gene expression of non_targets that show a |lfc| > 0.75
potential_fp <- filter(lfc_pert, type == "non_target", abs(lfc) > 0.75) %>%
  ungroup() %>%
  select(sample, gene) %>%
  distinct() %>%
  mutate(type = "False positive?")

# get average expression over all cells and all genes and mark potential false positives
avg_genex_fp <- avg_genex %>%
  left_join(potential_fp, by = c("sample", "gene")) %>%
  mutate(type = replace_na(type, "Other"))

# plot average expression of these genes vs other gene
p1 <- ggplot(avg_genex_fp, aes(x = type, y = avg_txs, fill = type, color = type)) +
  facet_wrap(~sample, scales = "free_y") +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 0.15) +
  geom_text_repel(data = filter(avg_genex_fp, type == "False positive?"), aes(label = gene),
                  color = "gray50") +
  labs(title = "Average expression potential false positives", y = "Average expression") +
  scale_color_manual(values = c("False positive?" = "firebrick2", "Other" = "steelblue3")) +
  scale_fill_manual(values = c("False positive?" = "firebrick2", "Other" = "steelblue3")) +
  scale_y_log10() +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none")

# compute detection rate for each gene
detection_rate <- gene_expr_l %>%
  group_by(sample, gene) %>%
  summarize(cells = sum(txs > 0), 
            detect = cells / n()) %>%
  left_join(potential_fp, by = c("sample", "gene")) %>%
  mutate(type = replace_na(type, "Other"))

# plot proportion of cells with zero expression
p2 <- ggplot(detection_rate, aes(x = type, y = detect, fill = type, color = type)) +
  facet_wrap(~sample, scales = "free_y") +
  geom_dotplot(binaxis = "y", stackdir = "center", binwidth = 0.034) +
  geom_text_repel(data = filter(detection_rate, type == "False positive?"), aes(label = gene),
                  color = "gray50") +
  labs(title = "Detection rate", y = " % cells with transcripts > 0") +
  scale_color_manual(values = c("False positive?" = "firebrick2", "Other" = "steelblue3")) +
  scale_fill_manual(values = c("False positive?" = "firebrick2", "Other" = "steelblue3")) +
  scale_y_continuous(labels = scales::percent) +
  theme_bw() +
  theme(axis.title.x = element_blank(), legend.position = "none")

# print plots
grid.arrange(p1, p2, ncol = 1)

R Markdown ggplot2 From line 518 of analyses/screen_data_qc.Rmd

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 17 of analyses/screen_grna_synthesis_errors.Rmd

# attach required packages
library(here)
library(Biostrings)
library(tidyverse)

R Markdown tidyverse here From line 24 of analyses/screen_grna_synthesis_errors.Rmd

# perturbation status
pert_8iScreen1 <- read.table(here("data/8iScreen1/perturb_status.txt"), header = TRUE,
                             stringsAsFactors = FALSE)
pert_11iScreen1 <- read.table(here("data/11iScreen1/perturb_status.txt"), header = TRUE,
                             stringsAsFactors = FALSE)

# convert perturbation status to long format and only keep detected perturbations
pert_8iScreen1 <- pert_8iScreen1 %>% 
  pivot_longer(-VECTOR, names_to = "cell_barcode", values_to = "pert") %>% 
  filter(pert > 0) %>% 
  select(-pert) %>% 
  dplyr::rename(vector_id = VECTOR)

pert_11iScreen1 <- pert_11iScreen1 %>% 
  pivot_longer(-VECTOR, names_to = "cell_barcode", values_to = "pert") %>% 
  filter(pert > 0) %>% 
  select(-pert) %>% 
  dplyr::rename(vector_id = VECTOR)

# files containing parsed misatch data
outfile_8iScreen1 <- here("data/8iScreen1/cropseq_vector_seqs.txt.gz")
outfile_11iScreen1 <- here("data/11iScreen1/cropseq_vector_seqs.txt.gz")

col_types <- cols(
  "cell_barcode" = col_character(),
  "umi_barcode" = col_character(),
  "aligned_vector_id" = col_character(),
  "mapping_quality" = col_integer(),
  "mismatches" = col_character(),
  "mismatch_positions" = col_character(),
  "sequence" = col_character()
)

# read mismatch data
mm_8iScreen1 <- read_tsv(outfile_8iScreen1, col_types = col_types)
mm_11iScreen1 <- read_tsv(outfile_11iScreen1, col_types = col_types)

R Markdown From line 34 of analyses/screen_grna_synthesis_errors.Rmd

# only retain reads with a mapping quality >= 10
mm_8iScreen1  <- dplyr::filter(mm_8iScreen1, mapping_quality >= 10)
mm_11iScreen1 <- dplyr::filter(mm_11iScreen1, mapping_quality >= 10)

# filter for reads aligning to inferred perturbations for each cell
mm_perts8 <- mm_8iScreen1 %>% 
  rename(vector_id = aligned_vector_id) %>% 
  mutate(cell_barcode = sub("XC:Z:", "", cell_barcode),
         vector_id = sub("GE:Z:CROPseq_dCas9_DS_", "", vector_id),
         mismatches = as.integer(sub("NM:i:", "", mismatches))) %>% 
  inner_join(pert_8iScreen1, by = c("cell_barcode", "vector_id"))

mm_perts11 <- mm_11iScreen1 %>% 
  rename(vector_id = aligned_vector_id) %>% 
  mutate(cell_barcode = sub("XC:Z:", "", cell_barcode),
         vector_id = sub("GE:Z:CROPseq_dCas9_DS_", "", vector_id),
         mismatches = as.integer(sub("NM:i:", "", mismatches))) %>% 
  inner_join(pert_11iScreen1, by = c("cell_barcode", "vector_id"))

R Markdown From line 80 of analyses/screen_grna_synthesis_errors.Rmd

# load vector sequences for chr8 and chr11 screen datasets
vectors8 <- readDNAStringSet(here("meta_data/cropseq_vectors/cropseq_vectors_chr8_screen.fasta"))
vectors11 <- readDNAStringSet(here("meta_data/cropseq_vectors/cropseq_vectors_chr11_screen.fasta"))

# convert to data.frame 
gRNAs8 <- data.frame(vector_id = names(vectors8), vector_seq = as.character(vectors8),
                     row.names = NULL, stringsAsFactors = FALSE)

gRNAs11 <- data.frame(vector_id = names(vectors11), vector_seq = as.character(vectors11),
                     row.names = NULL, stringsAsFactors = FALSE)

# extract gRNA sequences (19bp) from vector sequences
gRNAs8 <- gRNAs8 %>% 
  mutate(grna_seq = sub(".*TGTGGAAAGGACGAAACACCG", "", vector_seq),
         grna_seq = substr(grna_seq, start = 1, stop = 19))

gRNAs11 <- gRNAs11 %>% 
  mutate(grna_seq = sub(".*TGTGGAAAGGACGAAACACCG", "", vector_seq),
         grna_seq = substr(grna_seq, start = 1, stop = 19))

# extract gRNA sequences from vector reads, assuming gRNAs are 19bp long
mm_perts8  <- mutate(mm_perts8, grna_seq = substr(sequence, start = 22, stop = 40))
mm_perts11 <- mutate(mm_perts11, grna_seq = substr(sequence, start = 22, stop = 40))

R Markdown From line 103 of analyses/screen_grna_synthesis_errors.Rmd

# create consensus gRNA sequence for every detected (perturbations) gRNA for each cell
gRNA_consensus_chr8 <- mm_perts8 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

gRNA_consensus_chr11 <- mm_perts11 %>% 
  group_by(cell_barcode, vector_id) %>% 
  summarize(grna_consensus_seq = consensusString(DNAStringSet(grna_seq)))

# add expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  left_join(select(gRNAs8, -vector_seq), by = "vector_id")

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  left_join(select(gRNAs11, -vector_seq), by = "vector_id")

R Markdown Consensus From line 133 of analyses/screen_grna_synthesis_errors.Rmd

# function to compute edit distance between consensus and expected gRNA sequences
calc_edit_dist <- function(x, y, method = "levenshtein") {
  as.integer(stringDist(c(x, y), method = method))
}

# calculate edit distance between sequenced consensus gRNA and expected gRNA sequence
gRNA_consensus_chr8 <- gRNA_consensus_chr8 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

gRNA_consensus_chr11 <- gRNA_consensus_chr11 %>% 
  rowwise() %>% 
  mutate(edit_dist = calc_edit_dist(grna_consensus_seq, grna_seq, method = "levenshtein"))

noerr_chr8 <- mean(gRNA_consensus_chr8$edit_dist == 0)
noerr_chr11 <- mean(gRNA_consensus_chr11$edit_dist == 0)

R Markdown Consensus From line 154 of analyses/screen_grna_synthesis_errors.Rmd

# create one table
gRNA_cons <- bind_rows(Chr8 = gRNA_consensus_chr8, Chr11 = gRNA_consensus_chr11, .id = "Sample")

# create histogram of edit distance distribution
ggplot(gRNA_cons, aes(edit_dist, fill = Sample)) +
  geom_histogram(position = "dodge", bins = 20) +
  labs(x = "Edit distance\n(gRNA consensus sequence - template)",
       y = "gRNA - cell combinations") +
  scale_fill_manual(values = c(Chr11 = "indianred3", Chr8 = "steelblue")) +
  theme_bw() +
  theme(panel.grid = element_blank(), aspect.ratio = 1,
        text = element_text(size = 15))

# save plot to .pdf file
ggsave(filename = here("results/plots", "screen_gRNA_errors.pdf"), height = 4, width = 5)

R Markdown ggplot2 Consensus From line 173 of analyses/screen_grna_synthesis_errors.Rmd

/* Define a margin before every image element */
img {
  margin-top: 3em;
}

R Markdown From line 15 of analyses/tapseq_vs_cropseq.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# attach required packages
library(tidyverse)
library(ggrepel)
library(here)

# create plots directory if needed
dir.create("results/plots", showWarnings = FALSE)

R Markdown tidyverse ggrepel here From line 22 of analyses/tapseq_vs_cropseq.Rmd

# load read counts
read_counts <- read.csv(here(snakemake@input$reads_on_target), stringsAsFactors = FALSE)

# add variable for approach and rename Sample10X to include assessed chromosome
read_counts_plot <- read_counts %>%
  mutate(approach = if_else(sample == "Sample10X", true = "cropseq", false = "tapseq")) %>%
  mutate(sample = if_else(sample == "Sample10X", true = paste0(sample, " (", panel, ")"),
                          false = sample)) %>%
  arrange(panel, desc(sample))

# plot percentage of total reads on target genes
ggplot(read_counts_plot, aes(x = fct_inorder(sample), y = perc, fill = approach)) +
  geom_bar(stat = "identity") +
  labs(y = "Percentage of reads mapping to target genes", x = NULL) +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none") +
  scale_fill_manual(values =  c("tapseq" = "firebrick2", "cropseq" = "gray42"))

# figure caption
cap <- "Percentage of reads mapping to assessed target genes."

# save plot to file
ggsave(here("results/plots/reads_on_target.pdf"), device = "pdf", width = 5, height = 4)

R Markdown ggplot2 From line 39 of analyses/tapseq_vs_cropseq.Rmd

# load tap-seq target genes
target_genes <- read.csv(here(snakemake@input$target_genes), stringsAsFactors = FALSE)

# known enhancer targets for validation
known_enh <- target_genes %>%
  filter(screen == "validation") %>%
  pull(gene)

# get downsampled dge files and set names
ds_dge_files <- here(snakemake@input$dge)
names(ds_dge_files) <- ds_dge_files %>%
  sub("_avg_reads_per_cell.txt", "", .) %>%
  sub(".*data/", "", .) %>%
  sub("/downsampled/dge_", "_", .)

# load downsampled dge data and only retain chr8 and chr11 target genes, convert to long format and
# make one data.frame
ds_dge <- ds_dge_files %>%
  lapply(FUN = read.table, header = TRUE, stringsAsFactors = FALSE) %>%
  lapply(FUN = filter, GENE %in% target_genes$gene) %>%
  lapply(FUN = gather, key = "cell",  value = "txs", -GENE) %>%
  bind_rows(.id = "sample") %>%
  rename(gene = GENE)

# split sample into sample and sequencing depth, and add gene panel and experiment information
ds_dge <- ds_dge %>%
  separate(sample, into = c("sample", "avg_rpc"), sep = "_") %>%
  mutate(avg_rpc = as.numeric(avg_rpc)) %>%
  left_join(target_genes[, c("gene", "panel")], by = "gene") %>%
  mutate(panel = sub("_.+", "", panel)) %>%
  mutate(exp = case_when(sample %in% c("11iv210ng", "11iv22ng") ~ "chr11",
                         sample %in% c("8iv210ng", "8iv22ng") ~ "chr8",
                         TRUE ~ "cropseq"))

# only retain intended target gene expression for tap-seq samples
ds_dge <- ds_dge %>%
  mutate(sample_panel = paste0("chr", sub("iv2.+" ,"", sample))) %>%
  filter(sample_panel == panel | gene %in% known_enh | sample == "Sample10X") %>%
  select(-sample_panel)

R Markdown From line 74 of analyses/tapseq_vs_cropseq.Rmd

# only retain data on 3500 reads per cell downsampling
ds_dge_filt <- filter(ds_dge, avg_rpc == 3500)

# calculate average number transcripts per target gene and experiment
avg_txs <- ds_dge_filt %>%
  group_by(sample, exp, gene) %>%
  summarize(avg_txs = mean(txs)) %>%
  mutate(type = if_else(gene %in% known_enh, true = "e-gene", false = "non_e-gene")) %>%
  ungroup()

# extract average transcripts per gene for crop-seq
avg_txs_cropseq <- avg_txs %>%
  ungroup() %>%
  filter(exp == "cropseq") %>%
  select(gene, cropseq = avg_txs)

# add to tap-seq data and calculate log fold change
avg_txs_fc <- avg_txs %>%
  filter(exp != "cropseq") %>%
  left_join(avg_txs_cropseq, by = "gene") %>%
  mutate(fc = avg_txs / cropseq, lfc = log2(fc))

# axis limits
axis_limits <- range(select(avg_txs_fc, cropseq, avg_txs))

# create scatter plots
ggplot(avg_txs_fc, aes(x = cropseq, y = avg_txs, col = type)) + 
  facet_wrap(~sample, ncol = 2, nrow = 2) +
  geom_abline(slope = 1, color = "darkgray", size = 0.5) +
  geom_text_repel(data = filter(avg_txs_fc, type == "e-gene"), aes(label = gene),
                  box.padding = unit(1.5, "lines"), color = "gray50") +
  geom_point() +
  geom_smooth(method = lm,  se = FALSE, color = "steelblue") +
  scale_x_log10(limits = axis_limits) + 
  scale_y_log10(limits = axis_limits) +
  scale_color_manual(values = c("e-gene" = "firebrick2", "non_e-gene" = "gray42")) +
  labs(x = "Average transcripts per gene (CROP-seq)",
       y = "Average transcripts per gene (TAP-seq)") +
  coord_fixed() +
  theme_bw() +
  theme(legend.position = "none")

# figure caption
cap <- "Average transcripts per gene, TAP-seq vs. CROP-seq"

# save plot to file
ggsave(here("results/plots/avg_txs_per_gene.pdf"), device = "pdf", width = 7, height = 7)

R Markdown ggplot2 From line 116 of analyses/tapseq_vs_cropseq.Rmd

# axis limits
lim <- max(abs(avg_txs_fc$lfc))

# plot lfc per gene between methods
ggplot(avg_txs_fc, aes(x = sample, y = lfc, color = type)) +
  geom_hline(yintercept = 0, color = "darkgray", lty = "dashed", size = 1) +
  geom_jitter(width = 0.2, size = 1) +
  geom_boxplot(outlier.shape = NA, fill = NA, color = "gray42", notch = TRUE) +
  labs(x = "TAP-seq sample", y = "Log fold change TAP-seq / CROP-seq") +
  scale_color_manual(values = c("e-gene" = "firebrick2", "non_e-gene" = "gray42")) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

# figure caption
cap <- "Average LFC TAP-seq vs. CROP-seq"

# save plot to file
ggsave("results/plots/lfc_tapseq_vs_cropseq.pdf", device = "pdf", height = 4, width = 5)

R Markdown ggplot2 From line 168 of analyses/tapseq_vs_cropseq.Rmd

# calculate number of cells in which a gene is detected in each experiment
cells_genes <- ds_dge %>%
  group_by(exp, avg_rpc, gene) %>%
  summarize(avg_txs = mean(txs),
            perc_cells = mean(txs > 0))

# compute expression quantiles for all target genes based on average expression in CROP-seq
expr_quantiles <- cells_genes %>%
  ungroup() %>%
  filter(exp == "cropseq", avg_rpc == 20000) %>%
  mutate(quantile = ntile(avg_txs, 4)) %>%
  select(gene, quantile)

# add expression quantile to each gene
cells_genes <- left_join(cells_genes, expr_quantiles, by = "gene")

# reformat for plot
cells_genes_plot <- cells_genes %>%
  ungroup() %>%
  select(-avg_txs) %>%
  spread(key = "exp", value = "perc_cells") %>%
  gather(key = "exp", value = "tapseq", -c(avg_rpc, gene, quantile, cropseq)) %>%
  gather(key = "approach", value = "perc_cells", -c(avg_rpc, gene, quantile, exp)) %>%
  drop_na() %>%
  mutate(avg_rpc = factor(avg_rpc, levels = sort(unique(avg_rpc))),
         approach = fct_relevel(approach, "tapseq"))

# create boxplot comparing detection sensitivity between tap and crop-seq across sequencing depths
ggplot(cells_genes_plot, aes(x = as.factor(quantile), y = perc_cells)) +
  facet_wrap(~exp, ncol = 1, nrow = 2) +
  geom_boxplot(aes(linetype = approach, fill = avg_rpc), outlier.shape = NA) +
  labs(y = "Cells in which genes are detected", x = "CROP-seq expression quantiles",
       fill = "Avg reads per cell", linetype = "Method") +
  scale_y_continuous(labels = scales::percent) +
  theme_bw()

# figure caption
cap <- "Detection sensitivity TAP-seq vs. CROP-seq (all genes should be expressed in K562)"

# save plot to file
ggsave(here("results/plots/detection_sensitivity.pdf"), device = "pdf", height = 5, width = 7)

R Markdown ggplot2 From line 197 of analyses/tapseq_vs_cropseq.Rmd

library(GenomicAlignments)
library(GenomicFeatures)
library(here)
library(HiTC)
library(Matrix)
library(rtracklayer)
library(tidyverse)
library(BiocParallel)

# backend for parallel computing
register(MulticoreParam(workers = snakemake@threads))

# load processed differential expression output (includes confidence level and distance to TSS)
results <- here(snakemake@input$processed_results) %>%
  read.csv(stringsAsFactors = FALSE) %>%
  mutate(sample = paste0("chr", sub("iScreen1", "", sample)))

# extract cis-interactions on chromosome 8 and 11 and add column identifying significant hits
cis_enh_perts <- results %>%
  filter(enh_type == "cis", abs(dist_to_tss) >= 1000) %>%
  mutate(enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr),
         significant = if_else(pval_adj_allTests < 0.05, true = "sig", false = "non_sig"))

# remove validation controls on other chromosomes than the samples' target region and significant
# hits that increase gene expression
cis_enh_perts <- cis_enh_perts %>%
  filter(sample == enh_chr) %>%
  filter(!(significant == "sig" & manual_lfc > 0))

# Annotate chromatin ===============================================================================

# Every assessed enhancer is annotated for the number of chromatin assay ChIP-seq and DNAse-seq
# reads. The RPKM is calculated to normalize for both the sequencing depth and enhancer size.

# extract hg38 enhancer coordinates
enh_coords <- cis_enh_perts %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# get coordinates of genomic regions and extend by 10 kb
region_coords <- range(enh_coords)
start(region_coords) <- start(region_coords) - 10000
end(region_coords) <- end(region_coords) + 10000

# bam files containing aligned reads
bam_files <- here(snakemake@input$encode_bam)
names(bam_files) <- sub("_encode_rep1_alignment.bam", "", basename(bam_files))

# load aligned reads within the regions from bam files
chrom_reads <- bplapply(bam_files, FUN = readGAlignments,
                        param = ScanBamParam(which = region_coords))

# count the number of reads within each enhancer
enh_chrom_reads <- chrom_reads %>%
  bplapply(FUN = countOverlaps, query = enh_coords) %>%
  bind_cols()

# combine with enhancer coordinates and convert to long format
enh_chrom <- enh_coords %>%
  data.frame(stringsAsFactors = FALSE) %>%
  select(-strand) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  bind_cols(enh_chrom_reads) %>%
  gather(key = assay, value = reads, -c(1:5))

# normalize for sequencing depth and enhancer size by calculating rpkm
enh_chrom_norm <- enh_chrom %>%
  group_by(assay) %>%
  mutate(rpkm = reads / (sum(reads) / 1e6) / (width / 1000))

# convert to wide format add to enhancer gene pair results
cis_enh_perts <- enh_chrom_norm %>%
  select(-c(enh_chr, enh_start, enh_end, width, reads)) %>%
  spread(key = "assay", value = "rpkm") %>%
  left_join(x = cis_enh_perts, y = ., by = "perturbation")

# Annotate Hi-C ====================================================================================

# Hi-C contact matrices from Rao et al., 2014 are imported and the contact frequency for each
# enhancer - gene pair is calculated.

# Liftover enhancer and gene TSS coordinates -------------------------------------------------------

# Hi-C contact matrices are in hg19 annotation, so enhancer and gene TSS coordinates need to be
# lifted over from hg38 to hg19.

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(tools::file_path_sans_ext(chain_file))
closeAllConnections()  # close any open file connection

# extract hg38 enhancer coordinates
enh_coords_hg38 <- cis_enh_perts %>%
  select(perturbation, enh_chr, enh_start, enh_end) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "enh_chr",
                           start.field = "enh_start", end.field = "enh_end")

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end)

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

# extract gene TSS coordinates
gene_tss_hg38 <- cis_enh_perts %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_hg19 <- gene_tss_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand)

# replace gene tss hg38 coordinates with hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
cis_enh_perts <- cis_enh_perts %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>%
  select(-enh_center)

# Prepare Hi-C data --------------------------------------------------------------------------------

# Contact matrices for K562 cells are imported (5kb resolution). The provided normalization vectors
# are used to normalize the observed contacts (read counts). Data from the two chromosonal regions
# on chromosome 8 and 11 is extracted for any further analyses.

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
  )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

# k562 intrachromosomal sparse matrix and krnorm vector files for chromosomes 8 and 11
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))

krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))

# import normalization vectors
chr8_KRnorm  <- as.numeric(readLines(krnorm_files["chr8"]))
chr11_KRnorm <- as.numeric(readLines(krnorm_files["chr11"]))

# infer number of bins per chromosome based on the normalization vectors
chr8_bins  <- length(chr8_KRnorm)
chr11_bins <- length(chr11_KRnorm)

# import hi-c data for these chromosomes
chr8_hic  <- import_hic(scm_files["chr8"],  chromosome = "chr8", resolution = 5000,
                        bins = chr8_bins)
chr11_hic <- import_hic(scm_files["chr11"], chromosome = "chr11", resolution = 5000,
                        bins = chr11_bins)

# function to normalize Hi-C data based on provided normalization vectors
normalize_hic <- function(htc_obj, norm_vector) {

  # extract raw observed interaction matrix
  raw_obs <- intdata(htc_obj)

  # create normalization matrix by computing the outer product of the normalization vector
  norm_mat <- outer(norm_vector, norm_vector)

  # multiply observed interactions by normalization matrix and add back to HTC object
  intdata(htc_obj) <- raw_obs / norm_mat
  return(htc_obj)

}

# normalize HiC data
chr8_hic_norm  <- normalize_hic(chr8_hic,  norm_vector = chr8_KRnorm)
chr11_hic_norm <- normalize_hic(chr11_hic, norm_vector = chr11_KRnorm)

# infer chromosomal region range
region_coords <- cis_enh_perts %>%
  select(sample, enh_start, enh_end, gene_tss) %>%
  gather(key = "key", value = "coord", -sample) %>%
  group_by(sample) %>%
  summarize(start = min(coord), end = max(coord))

# calculate bin coordinates that contain the entire regions
region_bins <- region_coords %>%
  mutate(start = floor(start / 5000) * 5000,
         end = ceiling(end / 5000) * 5000)

# extract data for assessed regions
chr8_region_hic <- extractRegion(chr8_hic_norm, MARGIN = c(1, 2), chr = "chr8",
                                 from = pull(filter(region_bins, sample == "chr8"), start),
                                 to = pull(filter(region_bins, sample == "chr8"), end))

chr11_region_hic <- extractRegion(chr11_hic_norm, MARGIN = c(1, 2), chr = "chr11",
                                  from = pull(filter(region_bins, sample == "chr11"), start),
                                  to = pull(filter(region_bins, sample == "chr11"), end))

# Enhancer - gene pairs ----------------------------------------------------------------------------

# The interaction frequency of all tested enhancer - gene pair computed, defined as the interaction
# frequency of the genomic bins overlapping the enhancer and the target genes TSS.

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies for all bins and add pair id
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-c(pair))

}

# compute interaction frequencies for all tested enhancer - gene pairs
chr8_pairs <- cis_enh_perts %>%
  filter(enh_chr == "chr8", enh_chr == "chr8") %>%
  compute_int_freq(., htc_object = chr8_region_hic)

chr11_pairs <- cis_enh_perts %>%
  filter(enh_chr == "chr11", enh_chr == "chr11") %>%
  compute_int_freq(., htc_object = chr11_region_hic)

# combine into one data.frame
pairs <- rbind(chr8_pairs, chr11_pairs)

# Activity by contact ==============================================================================

# The activity by contact score is computed as proposed by Fulco et al., 2019

# function to calculate geometric mean
gm_mean <- function(x, na.rm = TRUE){
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# compute enhancer activity score
pairs <- pairs %>%
  rowwise() %>%
  mutate(activity_score = gm_mean(c(`Dnase-seq`, H3K27ac)))

# compute abc score
pairs <- pairs %>%
  ungroup() %>% 
  group_by(sample, gene) %>%
  mutate(abc_score = activity_score * (int_freq + 1),
         abc_score_norm = abc_score / sum(abc_score))

# convert distance to TSS to absolute value and change label of significant hits
pairs <- pairs %>%
  mutate(dist_to_tss = abs(dist_to_tss),
         significant = if_else(significant == "sig", true = 1, false = 0))

# write to output file
pairs <- select(pairs, -enh_in_intron)  # remove column not used in enhancer prediction analysis
write.csv(pairs, file = here(snakemake@output[[1]]), quote = FALSE, row.names = FALSE)

R tidyverse here Matrix ucsc-liftover GenomicAlignments HiTC From line 4 of chromatin_annotated_etps/enhancer_screen_etps.R

library(GenomicAlignments)
library(GenomicFeatures)
library(here)
library(HiTC)
library(Matrix)
library(rtracklayer)
library(tidyverse)
library(BiocParallel)

# backend for parallel computing
register(MulticoreParam(workers = snakemake@threads))

# Expressed genes ==================================================================================

# Whole-Tx CROP-seq data is used to compute expression in K562 cells.

# import gencode hg38 annotations
annot_url <- "ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"
annot <- import(annot_url, format = "gtf")

# extract exon annotations for MYC and GATA1
genes <- annot[annot$type == "exon" & annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding" &
                 annot$gene_name %in% c("MYC", "GATA1")]

# genome-wide K562 expression data
dge <- read.table(here(snakemake@input$wholeTx_dge), header = TRUE, row.names = "GENE",
                  stringsAsFactors = FALSE)

# get dge data on target genes
gene_expr <- dge[rownames(dge) %in% genes$gene_name, ]

# compute average gene expression
avg_expr <- data.frame(gene = rownames(gene_expr), avg_txs = rowSums(gene_expr),
                       stringsAsFactors = FALSE, row.names = NULL)

# split gene annotations by gene name
expr_genes <- split(genes, f = genes$gene_name)

# Select enhancers - gene pairs ====================================================================

# Tested enhancers from Fulco et al. 2018 are extracted and ETPs are formed with MYC and GATA1.

# import enhancer sites
enh_coords <- import(here(snakemake@input$enhancers))

# create enhancer id
enh_coords$perturbation <- paste0(seqnames(enh_coords), ":", start(enh_coords), "-",
                                  end(enh_coords))

# download chain file for hg19 to hg38 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"
chain_file <- tempfile("hg19ToHg38", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg19_to_hg38_chain <- import.chain(tools::file_path_sans_ext(chain_file))

# liftover enhancers from hg38 to hg19
enh_coords <- enh_coords %>%
  liftOver(chain = hg19_to_hg38_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords <- enh_coords[enh_loci_good]

# convert to GRanges object
enh_coords <- unlist(enh_coords)

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    tss <- min(start(x))
  }else if (all(strand(x) == "-")) {
    tss <- max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
  GRanges(seqnames = unique(seqnames(x)), ranges = IRanges(start = tss, end = tss),
          strand = unique(strand(x)), gene_name = unique(x$gene_name))
}

# get tss for all genes
gene_tss <- endoapply(expr_genes, FUN = get_tss)
gene_tss <- unlist(gene_tss)

# create data.frame containing enhancer data
enh_df <- enh_coords %>% 
  data.frame(stringsAsFactors = FALSE) %>% 
  select(perturbation, enh_chr = seqnames, enh_start = start, enh_end = end) %>% 
  mutate(enh_chr = as.character(enh_chr))

# remove enhancers, where liftover moved enhancer to other chromosome
enh_df <- enh_df %>% 
  mutate(chr_id = sub(":.+", "", perturbation)) %>% 
  filter(chr_id == enh_chr) %>% 
  select(-chr_id)

# create data.frame containing gene tss information
tss_df <- gene_tss %>% 
  data.frame(stringsAsFactors = FALSE) %>% 
  select(gene = gene_name, gene_chr = seqnames, gene_start = start, gene_end = end,
         gene_strand = strand) %>% 
  mutate(gene_strand = as.character(gene_strand),
         gene_chr = paste0("chr", as.character(gene_chr)))

# combine enhancers and tss data to form etps
etps <- left_join(enh_df, tss_df, by = c("enh_chr" = "gene_chr")) %>% 
  mutate(gene_chr = enh_chr) %>% 
  select(perturbation, gene, enh_chr, enh_start, enh_end, gene_chr, gene_tss = gene_start,
         gene_strand)

# calculate distance to tss for every enhancer - gene pair
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# Chromatin activity ===============================================================================

# Each ETP is annotated for chromatin activity at the involved enhancer. The RPKM is calculated to
# normalize for both the sequencing depth and enhancer size.

# chromatin assays and input files
chrom_infiles <- here(snakemake@input$encode_bam)
names(chrom_infiles) <- sub("_.+", "", basename(chrom_infiles))

# load all bam files
chrom_reads <- bplapply(chrom_infiles, FUN = readGAlignments)

# extract hg38 coordinates of all enhancers
enh_coords <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# count the number of reads within each enhancer
enh_chrom_reads <- chrom_reads %>%
  bplapply(FUN = countOverlaps, query = enh_coords) %>%
  bind_cols()

# combine with enhancer coordinates and convert to long format
enh_chrom <- enh_coords %>%
  data.frame(stringsAsFactors = FALSE) %>%
  select(-strand) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  bind_cols(enh_chrom_reads) %>%
  gather(key = assay, value = reads, -c(1:5))

# normalize for sequencing depth and enhancer size by calculating rpkm
enh_chrom_norm <- enh_chrom %>%
  group_by(assay) %>%
  mutate(rpkm = reads / (sum(reads) / 1e6) / (width / 1000))

# convert to wide format add to enhancer gene pair results
etps <- enh_chrom_norm %>%
  select(-c(enh_chr, enh_start, enh_end, width, reads)) %>%
  spread(key = "assay", value = "rpkm") %>%
  left_join(x = etps, y = ., by = "perturbation")

# remove data to free up memory
rm(enh_chrom_reads, chrom_reads, enh_chrom, enh_chrom_norm)
rm(annot, dge, expr_genes, gene_expr, genes)
gc()

# Annotate Hi-C ====================================================================================

# Hi-C contact matrices from Rao et al., 2014 are imported and the contact frequency for each
# enhancer - gene pair is calculated.

# Liftover enhancer and gene TSS coordinates -------------------------------------------------------

# Hi-C contact matrices are in hg19 annotation, so enhancer and gene TSS coordinates need to be
# lifted over from hg38 to hg19.

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(tools::file_path_sans_ext(chain_file))
closeAllConnections()  # close any open file connection

# extract hg38 coordinates of all enhancers
enh_coords_hg38 <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords_hg19, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords_hg19 <- enh_coords_hg19[enh_loci_good]

# reformat
enh_coords_hg19 <- enh_coords_hg19 %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  mutate(enh_chr = as.character(enh_chr))

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

# extract gene TSS coordinates
gene_tss_coords <- etps %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_coords_hg19 <- gene_tss_coords %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand) %>%
  mutate(gene_chr = as.character(gene_chr), gene_strand = as.character(gene_strand))

# replace gene tss hg38 coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_coords_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# remove enhancers that were not lifted over or switched chromosomes...
etps <- filter(etps, enh_chr == gene_chr)

# Prepare Hi-C data --------------------------------------------------------------------------------

# Contact matrices for K562 cells from Rao et al. 2014 are imported (5kb resolution). The provided
# normalization vectors are used to normalize the observed contacts (read counts). Data from the two
# chromosonal regions on chromosome 8 and 11 is extracted for any further analyses.

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
  )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

# k562 intrachromosomal sparse matrix files
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))

# normalization vector files
krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))

# import normalization vectors
krnorm <- lapply(krnorm_files, FUN = function(x) as.numeric(readLines(x)) )

# infer number of bins per chromosome based on the normalization vectors
hic_bins <- lapply(krnorm, FUN = length)

# import hi-c data for these chromosomes
hic_data <- bpmapply(FUN = import_hic,
                     sparse_cm_file = scm_files,
                     chromosome = names(scm_files),
                     bins = hic_bins,
                     MoreArgs = list(resolution = 5000))

# Enhancer - gene pairs ----------------------------------------------------------------------------

# The interaction frequency of all tested enhancer - gene pair computed, defined as the interaction
# frequency of the genomic bins overlapping the enhancer and the target genes TSS.

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object, norm_vector) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies and normalization factors for all bins
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]

  # compute normalization factors
  norm_vector_values <- matrix(norm_vector[as.matrix(bins[, 2:3])], ncol = 2)
  norm_factors <- norm_vector_values[, 1] * norm_vector_values[, 2]

  # normalize data and add and add pair id
  int_freq_pairs <- int_freq_pairs / norm_factors
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-c(pair))

}

# compute interaction frequencies for all tested enhancer - gene pairs
etps_chrs <- split(etps, f = etps$enh_chr)
hic_data <- hic_data[names(etps_chrs)]
krnorm <- krnorm[names(etps_chrs)]
int_freqs <- bpmapply(FUN = compute_int_freq, pairs = etps_chrs, htc_object = hic_data,
                      norm_vector = krnorm, SIMPLIFY = FALSE)

# combine into one data.frame
chrom_annot_etps <- bind_rows(int_freqs)

# Activity by contact ==============================================================================

# The activity by contact score is computed as proposed by Fulco et al. The geometric mean between
# H3K27ac and DNAse-seq is then calculatated for every site to compute an enhancer activity score
# proposed by Fulco et al. The ABC score is then computed by multiplying the activity score with the
# Hi-C interaction frequency.

# function to calculate geometric mean
gm_mean <- function(x, na.rm = TRUE){
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# compute enhancer activity score
chrom_annot_etps <- chrom_annot_etps %>%
  rowwise() %>%
  mutate(activity_score = gm_mean(c(`Dnase-seq`, H3K27ac))) %>%
  ungroup()

# compute abc score
chrom_annot_etps <- chrom_annot_etps %>%
  group_by(gene) %>%
  mutate(abc_score = activity_score * (int_freq + 1),
         abc_score_norm = abc_score / sum(abc_score))

# Finalize output ==================================================================================

# The output is finalized by adding the average gene expression, transforming the distance to TSS to
# absolute values and pruning pairs based on the distance to TSS.

# add average gene expression
chrom_annot_etps <- left_join(chrom_annot_etps, avg_expr, by = "gene")

# transform distance to TSS
chrom_annot_etps <- mutate(chrom_annot_etps, dist_to_tss = abs(dist_to_tss))

# rearrange columns
columns <- c("perturbation", "gene", "avg_txs", "enh_chr", "enh_start", "enh_end", "gene_chr",
             "gene_tss", "gene_strand", "dist_to_tss", "Dnase-seq", "H3K27ac", "H3K27me3",
             "H3K4me1", "H3K4me3", "POLR2A", "dist_bins", "int_freq", "activity_score",
             "abc_score", "abc_score_norm")
chrom_annot_etps <- chrom_annot_etps[, columns]

# write to output file
write.csv(chrom_annot_etps, file = here(snakemake@output[[1]]), quote = FALSE, row.names = FALSE)

R tidyverse here Matrix ucsc-liftover GenomicAlignments HiTC From line 4 of chromatin_annotated_etps/fulco_tiling_etps.R

library(GenomicAlignments)
library(GenomicFeatures)
library(here)
library(HiTC)
library(Matrix)
library(rtracklayer)
library(tidyverse)
library(BiocParallel)

# backend for parallel computing
register(MulticoreParam(workers = snakemake@threads))

# Expressed genes ==================================================================================

# Whole-Tx CROP-seq data is used to find expressed genes in K562 cells.

# import gencode hg38 annotations
annot_url <- "ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"
annot <- import(annot_url, format = "gtf")

# extract exon annotations for protein-coding genes on autosomes and chromosome X
genes <- annot[annot$type == "exon" & annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding" &
                 !seqnames(annot) %in% c("Y", "MT")]

# genome-wide K562 expression data
dge <- read.table(here(snakemake@input$wholeTx_dge), header = TRUE, row.names = "GENE",
                  stringsAsFactors = FALSE)

# only retain dge data on protein-coding genes
gene_expr <- dge[rownames(dge) %in% genes$gene_name, ]

# compute average gene expression
avg_expr <- data.frame(gene = rownames(gene_expr), avg_txs = rowSums(gene_expr),
                       stringsAsFactors = FALSE, row.names = NULL)

# extract genes with 100 or more transcripts on average
expr_gene_names <- avg_expr %>% 
  filter(avg_txs >= 100) %>% 
  pull(gene)

# get annotations for these genes and split by gene name
expr_genes <- genes[genes$gene_name %in% expr_gene_names]
expr_genes <- split(expr_genes, f = expr_genes$gene_name)

# Select enhancers - gene pairs ====================================================================

# Tested enhancers from the pilot experiments in Gasperini et al. 2019 are used to create ETPs with
# all expressed genes within 300kb.

# import enhancer sites
enh_coords <- import(snakemake@input$enhancers)

# remove duplicates
enh_coords <- unique(enh_coords)

# create enhancer id
enh_coords$perturbation <- paste0(seqnames(enh_coords), ":", start(enh_coords), "-",
                                  end(enh_coords))

# download chain file for hg19 to hg38 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"
chain_file <- tempfile("hg19ToHg38", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg19_to_hg38_chain <- import.chain(tools::file_path_sans_ext(chain_file))

# liftover enhancers from hg38 to hg19
enh_coords <- enh_coords %>%
  liftOver(chain = hg19_to_hg38_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords <- enh_coords[enh_loci_good]

# convert to GRanges object
enh_coords <- unlist(enh_coords)

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    tss <- min(start(x))
  }else if (all(strand(x) == "-")) {
    tss <- max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
  GRanges(seqnames = unique(seqnames(x)), ranges = IRanges(start = tss, end = tss),
          strand = unique(strand(x)), gene_name = unique(x$gene_name))
}

# get tss for all genes
gene_tss <- endoapply(expr_genes, FUN = get_tss)
gene_tss <- unlist(gene_tss)

# adjust chromosome names of enhancer coordinates
seqlevels(enh_coords) <- sub("chr", "", seqlevels(enh_coords))

# find enhancers that overlap a 600kb window centered on each genes tss
tss_windows <- promoters(gene_tss, upstream = 300000, downstream = 300000)
overlaps <- findOverlaps(query = enh_coords, subject = tss_windows)

# get overlapping gene and gene tss
ol_enh <- enh_coords[from(overlaps)]
ol_tss <- gene_tss[to(overlaps)]

# create data.frame containing etps
enh_df <- data.frame(ol_enh)
tss_df <- data.frame(ol_tss)
etps <- data.frame(perturbation = enh_df$perturbation, gene = tss_df$gene_name,
                   enh_chr = as.character(enh_df$seqnames), enh_start = enh_df$start,
                   enh_end = enh_df$end, gene_chr = as.character(tss_df$seqnames),
                   gene_tss = tss_df$start, gene_strand = as.character(tss_df$strand),
                   stringsAsFactors = FALSE)

# calculate distance to tss for every enhancer - gene pair
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# reformat chromosome ids
etps <- mutate(etps, enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr))

# Chromatin activity ===============================================================================

# Each ETP is annotated for chromatin activity at the involved enhancer. The RPKM is calculated to
# normalize for both the sequencing depth and enhancer size.

# chromatin assays and input files
chrom_infiles <- here(snakemake@input$encode_bam)
names(chrom_infiles) <- sub("_.+", "", basename(chrom_infiles))

# load all bam files
chrom_reads <- bplapply(chrom_infiles, FUN = readGAlignments)

# extract hg38 coordinates of all enhancers
enh_coords <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# count the number of reads within each enhancer
enh_chrom_reads <- chrom_reads %>%
  bplapply(FUN = countOverlaps, query = enh_coords) %>%
  bind_cols()

# combine with enhancer coordinates and convert to long format
enh_chrom <- enh_coords %>%
  data.frame(stringsAsFactors = FALSE) %>%
  select(-strand) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  bind_cols(enh_chrom_reads) %>%
  gather(key = assay, value = reads, -c(1:5))

# normalize for sequencing depth and enhancer size by calculating rpkm
enh_chrom_norm <- enh_chrom %>%
  group_by(assay) %>%
  mutate(rpkm = reads / (sum(reads) / 1e6) / (width / 1000))

# convert to wide format add to enhancer gene pair results
etps <- enh_chrom_norm %>%
  select(-c(enh_chr, enh_start, enh_end, width, reads)) %>%
  spread(key = "assay", value = "rpkm") %>%
  left_join(x = etps, y = ., by = "perturbation")

# remove data to free up memory
rm(enh_chrom_reads, chrom_reads, enh_chrom, enh_chrom_norm)
rm(annot, dge, expr_genes, gene_expr, genes)
gc()

# Annotate Hi-C ====================================================================================

# Hi-C contact matrices from Rao et al., 2014 are imported and the contact frequency for each
# enhancer - gene pair is calculated.

# Liftover enhancer and gene TSS coordinates -------------------------------------------------------

# Hi-C contact matrices are in hg19 annotation, so enhancer and gene TSS coordinates need to be
# lifted over from hg38 to hg19.

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(tools::file_path_sans_ext(chain_file))
closeAllConnections()  # close any open file connection

# extract hg38 coordinates of all enhancers
enh_coords_hg38 <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords_hg19, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords_hg19 <- enh_coords_hg19[enh_loci_good]

# reformat
enh_coords_hg19 <- enh_coords_hg19 %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  mutate(enh_chr = as.character(enh_chr))

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

# extract gene TSS coordinates
gene_tss_coords <- etps %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_coords_hg19 <- gene_tss_coords %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand) %>%
  mutate(gene_chr = as.character(gene_chr), gene_strand = as.character(gene_strand))

# replace gene tss hg38 coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_coords_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# remove enhancers that were not lifted over or switched chromosomes...
etps <- filter(etps, enh_chr == gene_chr)

# Prepare Hi-C data --------------------------------------------------------------------------------

# Contact matrices for K562 cells from Rao et al. 2014 are imported (5kb resolution). The provided
# normalization vectors are used to normalize the observed contacts (read counts). Data from the two
# chromosonal regions on chromosome 8 and 11 is extracted for any further analyses.

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
  )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

# k562 intrachromosomal sparse matrix files
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))
scm_files <- scm_files[unique(etps$enh_chr)]

# normalization vector files
krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))
krnorm_files <- krnorm_files[unique(etps$enh_chr)]

# import normalization vectors
krnorm <- lapply(krnorm_files, FUN = function(x) as.numeric(readLines(x)) )

# infer number of bins per chromosome based on the normalization vectors
hic_bins <- lapply(krnorm, FUN = length)

# import hi-c data for these chromosomes
hic_data <- bpmapply(FUN = import_hic,
                     sparse_cm_file = scm_files,
                     chromosome = names(scm_files),
                     bins = hic_bins,
                     MoreArgs = list(resolution = 5000))

## Enhancer - gene pairs ---------------------------------------------------------------------------

# The interaction frequency of all tested enhancer - gene pair computed, defined as the interaction
# frequency of the genomic bins overlapping the enhancer and the target genes TSS.

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object, norm_vector) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies and normalization factors for all bins
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]

  # compute normalization factors
  norm_vector_values <- matrix(norm_vector[as.matrix(bins[, 2:3])], ncol = 2)
  norm_factors <- norm_vector_values[, 1] * norm_vector_values[, 2]

  # normalize data and add and add pair id
  int_freq_pairs <- int_freq_pairs / norm_factors
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-c(pair))

}

# compute interaction frequencies for all tested enhancer - gene pairs
etps_chrs <- split(etps, f = etps$enh_chr)
hic_data <- hic_data[names(etps_chrs)]
krnorm <- krnorm[names(etps_chrs)]
int_freqs <- bpmapply(FUN = compute_int_freq, pairs = etps_chrs, htc_object = hic_data,
                      norm_vector = krnorm, SIMPLIFY = FALSE)

# combine into one data.frame
chrom_annot_etps <- bind_rows(int_freqs)

# Activity by contact ==============================================================================

# The activity by contact score is computed as proposed by Fulco et al. The geometric mean between
# H3K27ac and DNAse-seq is then calculatated for every site to compute an enhancer activity score
# proposed by Fulco et al. The ABC score is then computed by multiplying the activity score with the
# Hi-C interaction frequency.

# function to calculate geometric mean
gm_mean <- function(x, na.rm = TRUE){
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# compute enhancer activity score
chrom_annot_etps <- chrom_annot_etps %>%
  rowwise() %>%
  mutate(activity_score = gm_mean(c(`Dnase-seq`, H3K27ac))) %>%
  ungroup()

# compute abc score
chrom_annot_etps <- chrom_annot_etps %>%
  group_by(gene) %>%
  mutate(abc_score = activity_score * (int_freq + 1),
         abc_score_norm = abc_score / sum(abc_score))

# Finalize output ==================================================================================

# The output is finalized by adding the average gene expression, transforming the distance to TSS to
# absolute values and pruning pairs based on the distance to TSS.

# add average gene expression
chrom_annot_etps <- left_join(chrom_annot_etps, avg_expr, by = "gene")

# transform distance to TSS
chrom_annot_etps <- mutate(chrom_annot_etps, dist_to_tss = abs(dist_to_tss))

# only retain pairs with distance to TSS between 1 and 300 kb
chrom_annot_etps <- filter(chrom_annot_etps, dist_to_tss >= 1000, dist_to_tss <= 300000)

# rearrange columns
columns <- c("perturbation", "gene", "avg_txs", "enh_chr", "enh_start", "enh_end", "gene_chr",
             "gene_tss", "gene_strand", "dist_to_tss", "Dnase-seq", "H3K27ac", "H3K27me3",
             "H3K4me1", "H3K4me3", "POLR2A", "dist_bins", "int_freq", "activity_score",
             "abc_score", "abc_score_norm")
chrom_annot_etps <- chrom_annot_etps[, columns]

# write to output file
write.csv(chrom_annot_etps, file = here(snakemake@output[[1]]), quote = FALSE, row.names = FALSE)

R tidyverse here Matrix ucsc-liftover GenomicAlignments HiTC From line 4 of chromatin_annotated_etps/gasperini_pilot_etps.R

library(GenomicAlignments)
library(GenomicFeatures)
library(here)
library(HiTC)
library(Matrix)
library(rtracklayer)
library(tidyverse)
library(BiocParallel)

# backend for parallel computing
register(MulticoreParam(workers = snakemake@threads))

# Expressed genes ==================================================================================

# Whole-Tx CROP-seq data is used to find expressed genes in K562 cells.

# import gencode hg38 annotations
annot_url <- "ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"
annot <- import(annot_url, format = "gtf")

# extract exon annotations for protein-coding genes on autosomes and chromosome X
genes <- annot[annot$type == "exon" & annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding" &
                 !seqnames(annot) %in% c("Y", "MT")]

# genome-wide K562 expression data
dge <- read.table(here(snakemake@input$wholeTx_dge), header = TRUE, row.names = "GENE",
                  stringsAsFactors = FALSE)

# only retain dge data on protein-coding genes
gene_expr <- dge[rownames(dge) %in% genes$gene_name, ]

# compute average gene expression
avg_expr <- data.frame(gene = rownames(gene_expr), avg_txs = rowSums(gene_expr),
                       stringsAsFactors = FALSE, row.names = NULL)

# extract genes with 100 or more transcripts on average
expr_gene_names <- avg_expr %>% 
  filter(avg_txs >= 100) %>% 
  pull(gene)

# get annotations for these genes and split by gene name
expr_genes <- genes[genes$gene_name %in% expr_gene_names]
expr_genes <- split(expr_genes, f = expr_genes$gene_name)

# Select enhancers - gene pairs ====================================================================

# Tested enhancers from the enhancer screen in Gasperini et al. 2019 are used to create ETPs with
# all expressed genes within 300kb.

# import enhancer sites
enh_coords <- import(snakemake@input$enhancers)

# remove duplicates
enh_coords <- unique(enh_coords)

# create enhancer id
enh_coords$perturbation <- paste0(seqnames(enh_coords), ":", start(enh_coords), "-",
                                  end(enh_coords))

# download chain file for hg19 to hg38 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz"
chain_file <- tempfile("hg19ToHg38", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg19_to_hg38_chain <- import.chain(tools::file_path_sans_ext(chain_file))

# liftover enhancers from hg38 to hg19
enh_coords <- enh_coords %>%
  liftOver(chain = hg19_to_hg38_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords <- enh_coords[enh_loci_good]

# convert to GRanges object
enh_coords <- unlist(enh_coords)

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    tss <- min(start(x))
  }else if (all(strand(x) == "-")) {
    tss <- max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
  GRanges(seqnames = unique(seqnames(x)), ranges = IRanges(start = tss, end = tss),
          strand = unique(strand(x)), gene_name = unique(x$gene_name))
}

# get tss for all genes
gene_tss <- endoapply(expr_genes, FUN = get_tss)
gene_tss <- unlist(gene_tss)

# adjust chromosome names of enhancer coordinates
seqlevels(enh_coords) <- sub("chr", "", seqlevels(enh_coords))

# find enhancers that overlap a 600kb window centered on each genes tss
tss_windows <- promoters(gene_tss, upstream = 300000, downstream = 300000)
overlaps <- findOverlaps(query = enh_coords, subject = tss_windows)

# get overlapping gene and gene tss
ol_enh <- enh_coords[from(overlaps)]
ol_tss <- gene_tss[to(overlaps)]

# create data.frame containing etps
enh_df <- data.frame(ol_enh)
tss_df <- data.frame(ol_tss)
etps <- data.frame(perturbation = enh_df$perturbation, gene = tss_df$gene_name,
                   enh_chr = as.character(enh_df$seqnames), enh_start = enh_df$start,
                   enh_end = enh_df$end, gene_chr = as.character(tss_df$seqnames),
                   gene_tss = tss_df$start, gene_strand = as.character(tss_df$strand),
                   stringsAsFactors = FALSE)

# calculate distance to tss for every enhancer - gene pair
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# reformat chromosome ids
etps <- mutate(etps, enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr))

# Chromatin activity ===============================================================================

# Each ETP is annotated for chromatin activity at the involved enhancer. The RPKM is calculated to
# normalize for both the sequencing depth and enhancer size.

# chromatin assays and input files
chrom_infiles <- here(snakemake@input$encode_bam)
names(chrom_infiles) <- sub("_.+", "", basename(chrom_infiles))

# load all bam files
chrom_reads <- bplapply(chrom_infiles, FUN = readGAlignments)

# extract hg38 coordinates of all enhancers
enh_coords <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# count the number of reads within each enhancer
enh_chrom_reads <- chrom_reads %>%
  bplapply(FUN = countOverlaps, query = enh_coords) %>%
  bind_cols()

# combine with enhancer coordinates and convert to long format
enh_chrom <- enh_coords %>%
  data.frame(stringsAsFactors = FALSE) %>%
  select(-strand) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  bind_cols(enh_chrom_reads) %>%
  gather(key = assay, value = reads, -c(1:5))

# normalize for sequencing depth and enhancer size by calculating rpkm
enh_chrom_norm <- enh_chrom %>%
  group_by(assay) %>%
  mutate(rpkm = reads / (sum(reads) / 1e6) / (width / 1000))

# convert to wide format add to enhancer gene pair results
etps <- enh_chrom_norm %>%
  select(-c(enh_chr, enh_start, enh_end, width, reads)) %>%
  spread(key = "assay", value = "rpkm") %>%
  left_join(x = etps, y = ., by = "perturbation")

# remove data to free up memory
rm(enh_chrom_reads, chrom_reads, enh_chrom, enh_chrom_norm)
rm(annot, dge, expr_genes, gene_expr, genes)
invisible(gc())

# Annotate Hi-C ====================================================================================

# Hi-C contact matrices from Rao et al., 2014 are imported and the contact frequency for each
# enhancer - gene pair is calculated.

# Liftover enhancer and gene TSS coordinates -------------------------------------------------------

# Hi-C contact matrices are in hg19 annotation, so enhancer and gene TSS coordinates need to be
# lifted over from hg38 to hg19.

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(tools::file_path_sans_ext(chain_file))
closeAllConnections()  # close any open file connection

# extract hg38 coordinates of all enhancers
enh_coords_hg38 <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain)

# filter out enhancers that did not lift over correctly (split into 2 loci)
enh_loci <- sapply(enh_coords_hg19, length)
enh_loci_good <- which(enh_loci == 1)
enh_coords_hg19 <- enh_coords_hg19[enh_loci_good]

# reformat
enh_coords_hg19 <- enh_coords_hg19 %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  mutate(enh_chr = as.character(enh_chr))

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

# extract gene TSS coordinates
gene_tss_coords <- etps %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_coords_hg19 <- gene_tss_coords %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand) %>%
  mutate(gene_chr = as.character(gene_chr), gene_strand = as.character(gene_strand))

# replace gene tss hg38 coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_coords_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# remove enhancers that were not lifted over or switched chromosomes...
etps <- filter(etps, enh_chr == gene_chr)

# Prepare Hi-C data --------------------------------------------------------------------------------

# Contact matrices for K562 cells from Rao et al. 2014 are imported (5kb resolution). The provided
# normalization vectors are used to normalize the observed contacts (read counts). Data from the two
# chromosonal regions on chromosome 8 and 11 is extracted for any further analyses.

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
  )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

# k562 intrachromosomal sparse matrix files
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))

# normalization vector files
krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))

# import normalization vectors
krnorm <- lapply(krnorm_files, FUN = function(x) as.numeric(readLines(x)) )

# infer number of bins per chromosome based on the normalization vectors
hic_bins <- lapply(krnorm, FUN = length)

# import hi-c data for these chromosomes
hic_data <- bpmapply(FUN = import_hic,
                     sparse_cm_file = scm_files,
                     chromosome = names(scm_files),
                     bins = hic_bins,
                     MoreArgs = list(resolution = 5000))

## Enhancer - gene pairs ---------------------------------------------------------------------------

# The interaction frequency of all tested enhancer - gene pair computed, defined as the interaction
# frequency of the genomic bins overlapping the enhancer and the target genes TSS.

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object, norm_vector) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies and normalization factors for all bins
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]

  # compute normalization factors
  norm_vector_values <- matrix(norm_vector[as.matrix(bins[, 2:3])], ncol = 2)
  norm_factors <- norm_vector_values[, 1] * norm_vector_values[, 2]

  # normalize data and add and add pair id
  int_freq_pairs <- int_freq_pairs / norm_factors
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-c(pair))

}

# compute interaction frequencies for all tested enhancer - gene pairs
etps_chrs <- split(etps, f = etps$enh_chr)
hic_data <- hic_data[names(etps_chrs)]
krnorm <- krnorm[names(etps_chrs)]
int_freqs <- bpmapply(FUN = compute_int_freq, pairs = etps_chrs, htc_object = hic_data,
                      norm_vector = krnorm, SIMPLIFY = FALSE)

# combine into one data.frame
chrom_annot_etps <- bind_rows(int_freqs)

# Activity by contact ==============================================================================

# The activity by contact score is computed as proposed by Fulco et al. The geometric mean between
# H3K27ac and DNAse-seq is then calculatated for every site to compute an enhancer activity score
# proposed by Fulco et al. The ABC score is then computed by multiplying the activity score with the
# Hi-C interaction frequency.

# function to calculate geometric mean
gm_mean <- function(x, na.rm = TRUE){
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# compute enhancer activity score
chrom_annot_etps <- chrom_annot_etps %>%
  rowwise() %>%
  mutate(activity_score = gm_mean(c(`Dnase-seq`, H3K27ac))) %>%
  ungroup()

# compute abc score
chrom_annot_etps <- chrom_annot_etps %>%
  group_by(gene) %>%
  mutate(abc_score = activity_score * (int_freq + 1),
         abc_score_norm = abc_score / sum(abc_score))

# Finalize output ==================================================================================

# The output is finalized by adding the average gene expression, transforming the distance to TSS to
# absolute values and pruning pairs based on the distance to TSS.

# add average gene expression
chrom_annot_etps <- left_join(chrom_annot_etps, avg_expr, by = "gene")

# transform distance to TSS
chrom_annot_etps <- mutate(chrom_annot_etps, dist_to_tss = abs(dist_to_tss))

# only retain pairs with distance to TSS between 1 and 300 kb
chrom_annot_etps <- filter(chrom_annot_etps, dist_to_tss >= 1000, dist_to_tss <= 300000)

# rearrange columns
columns <- c("perturbation", "gene", "avg_txs", "enh_chr", "enh_start", "enh_end", "gene_chr",
             "gene_tss", "gene_strand", "dist_to_tss", "Dnase-seq", "H3K27ac", "H3K27me3",
             "H3K4me1", "H3K4me3", "POLR2A", "dist_bins", "int_freq", "activity_score",
             "abc_score", "abc_score_norm")
chrom_annot_etps <- chrom_annot_etps[, columns]

# write to output file
write.csv(chrom_annot_etps, file = here(snakemake@output[[1]]), quote = FALSE, row.names = FALSE)

R tidyverse here Matrix ucsc-liftover GenomicAlignments HiTC From line 4 of chromatin_annotated_etps/gasperini_screen_etps.R

library(GenomicAlignments)
library(GenomicFeatures)
library(here)
library(HiTC)
library(Matrix)
library(rtracklayer)
library(tidyverse)
library(BiocParallel)

# backend for parallel computing
register(MulticoreParam(workers = snakemake@threads))

# Expressed genes ==================================================================================

# Whole-Tx CROP-seq data is used to find expressed genes in K562 cells

# import gencode hg38 annotations
annot_url <- "ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"
annot <- import(annot_url, format = "gtf")

# extract exon annotations for protein-coding genes on autosomes
genes <- annot[annot$type == "exon" & annot$gene_biotype == "protein_coding" &
                 annot$transcript_biotype == "protein_coding" &
                 !seqnames(annot) %in% c("X", "Y", "MT")]

# genome-wide K562 expression data
dge <- read.table(here(snakemake@input$wholeTx_dge), header = TRUE, row.names = "GENE",
                  stringsAsFactors = FALSE)

# only retain dge data on protein-coding genes
gene_expr <- dge[rownames(dge) %in% genes$gene_name, ]

# compute average gene expression
avg_expr <- data.frame(gene = rownames(gene_expr), avg_txs = rowSums(gene_expr),
                       stringsAsFactors = FALSE, row.names = NULL)

# extract genes with 100 or more transcripts on average
expr_gene_names <- avg_expr %>% 
  filter(avg_txs >= 100) %>% 
  pull(gene)

# get annotations for these genes and split by gene name
expr_genes <- genes[genes$gene_name %in% expr_gene_names]
expr_genes <- split(expr_genes, f = expr_genes$gene_name)

# Select enhancers - target pairs ==================================================================

# Predicted enhancer sites are selected that are within 300kb of the expressed genes TSS

# load genome-wide enhancers
enh <- read.table(here(snakemake@input$enhancers), col.names = "enh_id", stringsAsFactors = FALSE)

# split enhancer id into chromosome, start and end coordinates
enh <- enh %>% 
  separate(col = enh_id, into = c("chr", "start", "end"), convert = TRUE, remove = FALSE) %>% 
  mutate(chr = sub("chr", "", chr))

# create GRanges object containing enhancer coordinates
enh_coords <- makeGRangesFromDataFrame(enh, keep.extra.columns = TRUE)

# function to get a genes TSS
get_tss <- function(x) {
  if (all(strand(x) == "+")) {
    tss <- min(start(x))
  }else if (all(strand(x) == "-")) {
    tss <- max(end(x))
  }else{
    stop("Inconsistent strand information!")
  }
  GRanges(seqnames = unique(seqnames(x)), ranges = IRanges(start = tss, end = tss),
          strand = unique(strand(x)), gene_name = unique(x$gene_name))
}

# get tss for all genes
gene_tss <- endoapply(expr_genes, FUN = get_tss)
gene_tss <- unlist(gene_tss)

# find enhancers that overlap a 600kb window centered on each genes tss
tss_windows <- promoters(gene_tss, upstream = 300000, downstream = 300000)
overlaps <- findOverlaps(query = enh_coords, subject = tss_windows)

# get overlapping gene and gene tss
ol_enh <- enh_coords[from(overlaps)]
ol_tss <- gene_tss[to(overlaps)]

# create data.frame containing etps
enh_df <- data.frame(ol_enh)
tss_df <- data.frame(ol_tss)
etps <- data.frame(perturbation = enh_df$enh_id, gene = tss_df$gene_name,
                   enh_chr = as.character(enh_df$seqnames), enh_start = enh_df$start,
                   enh_end = enh_df$end, gene_chr = as.character(tss_df$seqnames),
                   gene_tss = tss_df$start, gene_strand = as.character(tss_df$strand),
                   stringsAsFactors = FALSE)

# calculate distance to tss for every enhancer - gene pair
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# reformat chromosome ids
etps <- mutate(etps, enh_chr = paste0("chr", enh_chr), gene_chr = paste0("chr", gene_chr))

# Chromatin activity ===============================================================================

# Each ETP is annotated for chromatin activity at the involved enhancer. The RPKM is calculated to
# normalize for both the sequencing depth and enhancer size

# chromatin assays and input files
chrom_infiles <- here(snakemake@input$encode_bam)
names(chrom_infiles) <- sub("_.+", "", basename(chrom_infiles))

# load all bam files
chrom_reads <- bplapply(chrom_infiles, FUN = readGAlignments)

# extract hg38 coordinates of all enhancers
enh_coords <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# count the number of reads within each enhancer
enh_chrom_reads <- chrom_reads %>%
  bplapply(FUN = countOverlaps, query = enh_coords) %>%
  bind_cols()

# combine with enhancer coordinates and convert to long format
enh_chrom <- enh_coords %>%
  data.frame(stringsAsFactors = FALSE) %>%
  select(-strand) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  bind_cols(enh_chrom_reads) %>%
  gather(key = assay, value = reads, -c(1:5))

# normalize for sequencing depth and enhancer size by calculating rpkm
enh_chrom_norm <- enh_chrom %>%
  group_by(assay) %>%
  mutate(rpkm = reads / (sum(reads) / 1e6) / (width / 1000))

# convert to wide format add to enhancer gene pair results
etps <- enh_chrom_norm %>%
  select(-c(enh_chr, enh_start, enh_end, width, reads)) %>%
  spread(key = "assay", value = "rpkm") %>%
  left_join(x = etps, y = ., by = "perturbation")

# remove data to free up memory
rm(enh_chrom_reads, chrom_reads, enh_chrom, enh_chrom_norm)
rm(annot, dge, expr_genes, gene_expr, genes)
invisible(gc())

# Annotate Hi-C ====================================================================================

# Hi-C contact matrices from Rao et al., 2014 are imported and the contact frequency for each
# enhancer - gene pair is calculated

# liftover ETPs ------------------------------------------------------------------------------------

# Hi-C contact matrices are in hg19 annotation, so enhancer and gene TSS coordinates need to be
# lifted over from hg38 to hg19.

# download chain file for hg38 to hg19 liftover
chain_url <- "http://hgdownload.cse.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz"
chain_file <- tempfile("hg38ToHg19", fileext = ".gz")
download.file(chain_url, chain_file)
system(paste("gunzip", chain_file))

# import chain file
hg38_to_hg19_chain <- import.chain(tools::file_path_sans_ext(chain_file))

# extract hg38 coordinates of all enhancers
enh_coords_hg38 <- etps %>%
  select(enh_chr, enh_start, enh_end, perturbation) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# liftover enhancers from hg38 to hg19 and convert to data.frame
enh_coords_hg19 <- enh_coords_hg38 %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, end, perturbation) %>%
  rename(enh_chr = seqnames, enh_start = start, enh_end = end) %>%
  mutate(enh_chr = as.character(enh_chr))

# replace tested enhancer - gene pairs hg38 enhancer coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(enh_chr, enh_start, enh_end)) %>%
  left_join(enh_coords_hg19, by = "perturbation")

# extract gene TSS coordinates
gene_tss_coords <- etps %>%
  select(gene, gene_chr, gene_tss, gene_strand) %>%
  distinct() %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE, seqnames.field = "gene_chr",
                           start.field = "gene_tss", end.field = "gene_tss",
                           strand.field = "gene_strand")

# liftover tss coordinates to hg19
gene_tss_coords_hg19 <- gene_tss_coords %>%
  liftOver(chain = hg38_to_hg19_chain) %>%
  unlist() %>%
  as.data.frame() %>%
  select(seqnames, start, strand, gene) %>%
  rename(gene_chr = seqnames, gene_tss = start, gene_strand = strand) %>%
  mutate(gene_chr = as.character(gene_chr), gene_strand = as.character(gene_strand))

# replace gene tss hg38 coordinates with hg19 coordinates
etps <- etps %>%
  select(-c(gene_chr, gene_tss, gene_strand)) %>%
  left_join(gene_tss_coords_hg19, by = "gene")

# recalculate distance to tss for hg19 coordinates
etps <- etps %>%
  mutate(enh_center = round((enh_start + enh_end) / 2)) %>% 
  mutate(dist_to_tss = if_else(gene_strand == "+", true  = enh_center - gene_tss,
                               false = gene_tss - enh_center)) %>% 
  select(-enh_center)

# remove enhancers that were not lifted over or switched chromosomes...
etps <- filter(etps, enh_chr == gene_chr)

# Prepare Hi-C data --------------------------------------------------------------------------------

# Contact matrices for K562 cells from Rao et al. 2014 are imported (5kb resolution). The provided
# normalization vectors are used to normalize the observed contacts (read counts). Data from the two
# chromosonal regions on chromosome 8 and 11 is extracted for any further analyses.

# function to import HiC data from Rao et al for one chromosome and create a HTCexp object
import_hic <- function(sparse_cm_file, chromosome, resolution, bins) {

  # load sparse contact matrix file (only observed contacts)
  obs_contacts <- read.table(sparse_cm_file, col.names = c("i", "j", "M_ij"), sep = "\t")

  # get starting coordinates of assessed genomic bins at 5kb resolution
  max_bin <- (bins - 1) * resolution
  bin_starts <- seq(from = 0, to = max_bin, by = resolution)

  # create GRanges object containing all assessed bins for that chromosome
  bin_coords <- GRanges(seqnames = chromosome,
                        ranges = IRanges(start = bin_starts, end = bin_starts + resolution - 1,
                                         names = paste0("bin_", seq_len(length(bin_starts))))
  )

  # convert starting coordinates of bins in sparse matrix input to bin ids by dividing by the
  # resolution (and add 1 to get correct index)
  obs_contacts_bins <- data.frame(i = round(obs_contacts$i / resolution + 1),
                                  j = round(obs_contacts$j / resolution + 1), 
                                  M_ij = obs_contacts$M_ij)

  # create sparse contact matrix from observed contacts
  sparse_cm <- sparseMatrix(i = obs_contacts_bins$i, j = obs_contacts_bins$j,
                            x = obs_contacts_bins$M_ij, symmetric = TRUE, dims = c(bins, bins))

  # create HTCexp object containing data for the given chromosome
  HTCexp(intdata = sparse_cm, xgi = bin_coords, ygi = bin_coords)

}

# k562 intrachromosomal sparse matrix files
scm_files <- here(snakemake@input$hic_raw)
names(scm_files) <- sub("_5kb.RAWobserved", "", basename(scm_files))

# normalization vector files
krnorm_files <- here(snakemake@input$hic_norm)
names(krnorm_files) <- sub("_5kb.KRnorm", "", basename(krnorm_files))

# import normalization vectors
krnorm <- lapply(krnorm_files, FUN = function(x) as.numeric(readLines(x)) )

# infer number of bins per chromosome based on the normalization vectors
hic_bins <- lapply(krnorm, FUN = length)

# import hi-c data for these chromosomes
hic_data <- bpmapply(FUN = import_hic,
                     sparse_cm_file = scm_files,
                     chromosome = names(scm_files),
                     bins = hic_bins,
                     MoreArgs = list(resolution = 5000))

# Enhancer - gene pairs ----------------------------------------------------------------------------

# The interaction frequency of all tested enhancer - gene pair computed, defined as the interaction
# frequency of the genomic bins overlapping the enhancer and the target genes TSS.

# function to compute the interaction frequency for enhancer - gene pairs, by finding the hi-c 
# genomic bins with overlap with the enhancer and the target gene tss. the interaction frequency of
# the pair is then defined as the interaction frequency of these bins
compute_int_freq <- function(pairs, htc_object, norm_vector) {

  # add pair identifier
  pairs$pair <- seq_len(nrow(pairs))

  # get coordinates of enhancer centers as GRanges object
  enh_coords <- pairs %>%
    mutate(enh_center = round((enh_start + enh_end) / 2)) %>%
    select(enh_chr, enh_center) %>%
    makeGRangesFromDataFrame(., seqnames.field = "enh_chr", start.field = "enh_center",
                             end.field = "enh_center")

  # get gene tss coordinates as GRanges object
  tss_coords <- pairs %>%
    select(gene_chr, gene_tss) %>%
    makeGRangesFromDataFrame(., seqnames.field = "gene_chr", start.field = "gene_tss",
                             end.field = "gene_tss")

  # get bins overlapping for all enhancers and gene tss
  hic_bins <- x_intervals(htc_object)
  enh_bins <- findOverlaps(query = enh_coords, subject = hic_bins)
  tss_bins <- findOverlaps(query = tss_coords, subject = hic_bins)

  # combine into one data.frame
  enh_bins <- data.frame(pair = from(enh_bins), enh_bin = to(enh_bins))
  tss_bins <- data.frame(pair = from(tss_bins), tss_bin = to(tss_bins))
  bins <- full_join(enh_bins, tss_bins, by = "pair")

  # extract distance matrix between bins from htc object
  dists <- intervalsDist(htc_object)

  # calculate distances between bins of enhancer gene pairs
  dist_pairs <- dists[as.matrix(bins[, 2:3])]
  dist_pairs <- data.frame(pair = bins$pair, dist_bins = dist_pairs)

  # extract hi-c interaction matrix from htc object
  intdata <- intdata(htc_object)

  # get interaction frequencies and normalization factors for all bins
  int_freq_pairs <- intdata[as.matrix(bins[, 2:3])]

  # compute normalization factors
  norm_vector_values <- matrix(norm_vector[as.matrix(bins[, 2:3])], ncol = 2)
  norm_factors <- norm_vector_values[, 1] * norm_vector_values[, 2]

  # normalize data and add and add pair id
  int_freq_pairs <- int_freq_pairs / norm_factors
  int_freq_pairs <- data.frame(pair = bins$pair, int_freq = int_freq_pairs)

  # add interaction frequencies and bin distances to pairs to create output
  pairs %>%
    left_join(dist_pairs, by = "pair") %>%
    left_join(int_freq_pairs, by = "pair") %>%
    select(-c(pair))

}

# compute interaction frequencies for all tested enhancer - gene pairs
etps_chrs <- split(etps, f = etps$enh_chr)
etps_chrs <- etps_chrs[names(hic_data)]
krnorm <- krnorm[names(etps_chrs)]
int_freqs <- bpmapply(FUN = compute_int_freq, pairs = etps_chrs, htc_object = hic_data,
                      norm_vector = krnorm, SIMPLIFY = FALSE)

# combine into one data.frame
chrom_annot_etps <- bind_rows(int_freqs)

# Activity by contact ==============================================================================

# The activity by contact score is computed as proposed by Fulco et al. The geometric mean between
# H3K27ac and DNAse-seq is then calculatated for every site to compute an enhancer activity score
# proposed by Fulco et al. The ABC score is then computed by multiplying the activity score with the
# Hi-C interaction frequency.

# function to calculate geometric mean
gm_mean <- function(x, na.rm = TRUE){
  exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# compute enhancer activity score
chrom_annot_etps <- chrom_annot_etps %>%
  rowwise() %>%
  mutate(activity_score = gm_mean(c(`Dnase-seq`, H3K27ac))) %>%
  ungroup()

# compute abc score
chrom_annot_etps <- chrom_annot_etps %>%
  group_by(gene) %>%
  mutate(abc_score = activity_score * (int_freq + 1),
         abc_score_norm = abc_score / sum(abc_score))

# Finalize output ==================================================================================

# The output is finalized by adding the average gene expression, transforming the distance to TSS to
# absolute values and pruning pairs based on the distance to TSS.

# add average gene expression
chrom_annot_etps <- left_join(chrom_annot_etps, avg_expr, by = "gene")

# transform distance to TSS
chrom_annot_etps <- mutate(chrom_annot_etps, dist_to_tss = abs(dist_to_tss))

# only retain pairs with distance to TSS between 1 and 300 kb
chrom_annot_etps <- filter(chrom_annot_etps, dist_to_tss >= 1000, dist_to_tss <= 300000)

# rearrange columns
columns <- c("perturbation", "gene", "avg_txs", "enh_chr", "enh_start", "enh_end", "gene_chr",
             "gene_tss", "gene_strand", "dist_to_tss", "Dnase-seq", "H3K27ac", "H3K27me3",
             "H3K4me1", "H3K4me3", "POLR2A", "dist_bins", "int_freq", "activity_score",
             "abc_score", "abc_score_norm")
chrom_annot_etps <- chrom_annot_etps[, columns]

# write to output file
write.csv(chrom_annot_etps, file = here(snakemake@output[[1]]), quote = FALSE, row.names = FALSE)

R tidyverse here Matrix ucsc-liftover GenomicAlignments HiTC From line 4 of chromatin_annotated_etps/genome_wide_etps.R

sd=$(dirname $0)

# input directory passed as argument
indir=$1

# output files passed as arguments
r1_outfile=$2
r2_outfile=$3

# delete potentially existing output files to avoid attaching to them
if [ -f $r1_outfile ]; then
  rm $r1_outfile
fi

if [ -f $r2_outfile ]; then
  rm $r2_outfile
fi

# BARCODE READS ------------------------------------------------------------------------------------

# function to get i7 barcode from barcode table, add it to the beginning of every barcode read and
# append output to compressed fastq file
process_read1 () {
  local indir=$1
  local outfile=$2
  local i7_file=$3
  for i in ${indir}/*_1.fastq.gz; do
    bn=$(basename $i)
    sample=$(echo $bn | perl -pe 's/(\w+)_1.fastq.gz/$1/')
    i7=$(awk -v sample="$sample" '$1 == sample{print $2}' $i7_file)
    echo -e "\tprocessing ${i} (i7 = ${i7})..."
    zcat $i | sed "2~4s/^/${i7}/" | sed '4~4s/^/IIIIIIII/' | gzip >> $outfile
  done
}

# add all genome reads into one fastq file
echo "Merging barcode reads:"
process_read1 $indir $r1_outfile ${sd}/i7_barcodes.txt

# GENOME READS -------------------------------------------------------------------------------------

# function to simply append all genome reads to one compressed fastq file
process_read2() {
  local indir=$1
  local outfile=$2
  for i in ${indir}/*_2.fastq.gz; do
    echo -e "\tprocessing ${i}..."
    cat $i >> $outfile
  done
}

# add all genome reads into one fastq file
echo -e "\nMerging genome reads:"
process_read2 $indir $r2_outfile

echo -e "\nDone!"

Shell From line 8 of download_fastq/merge_fastq_i7.sh

sd=$(dirname $0)

# input directory passed as argument
indir=$1

# output files passed as arguments
r1_outfile=$2
r2_outfile=$3

# delete potentially existing output files to avoid attaching to them
if [ -f $r1_outfile ]; then
  rm $r1_outfile
fi

if [ -f $r2_outfile ]; then
  rm $r2_outfile
fi

# function to simply append all genome reads to one compressed fastq file
process_read() {
  local indir=$1
  local outfile=$2
  local read=$3
  for i in ${indir}/*_${read}.fastq.gz; do
    echo -e "\tprocessing ${i}..."
    cat $i >> $outfile
  done
}

echo "Merging barcode reads:"
process_read $indir $r1_outfile "1"

echo -e "\nMerging genome reads:"
process_read $indir $r2_outfile "2"

echo -e "\nDone!"

Shell From line 8 of download_fastq/merge_fastq.sh

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# load required functions and packages
library(here)
source(file.path(snakemake@scriptdir, "align_report_fun.R"))

R Markdown here From line 15 of processing/align_report.Rmd

# calculate summary statistics
bc_stats <- bc_summary(here(snakemake@input$cell_bcs), here(snakemake@input$mol_bcs))

# print table
knitr::kable(bc_stats)

R Markdown From line 32 of processing/align_report.Rmd

# extract total number of reads from STAR log output
input_reads  <- readLines(here(snakemake@input$star_smry), n = 6)[6]
total_reads <- as.numeric(unlist(strsplit(input_reads, split = "\t"))[2])

# plot adapter trimming summary
adapter_trim_hist(adapter_trim_file = here(snakemake@input$adapt_trim),
                  sample = snakemake@wildcards$sample,
                  total_reads = total_reads)

R Markdown STAR From line 51 of processing/align_report.Rmd

# plot polyA trimming summary
polyA_trim_hist(polyA_trim_file = here(snakemake@input$polyA_trim),
                  sample = snakemake@wildcards$sample,
                  total_reads = total_reads)

R Markdown From line 64 of processing/align_report.Rmd

# extract percentage of reads across mapping cats
perc_reads <- get_mapping_cats(here(snakemake@input$star_smry))

# plot percentage of reads
plot_mapping_cats(mapping_cats = perc_reads, sample = snakemake@wildcards$sample,
                  total_reads = total_reads)

R Markdown From line 79 of processing/align_report.Rmd

# get sample id
sample <- snakemake@wildcards$sample

# extract expected cell number from snakemake object
expect_cells <- as.numeric(snakemake@config$cell_numbers[sample])

# compute the reads per cell barcode distribution for the top x cells, where
# x is the number of expected cells * 2 and compute local minimum in
# bimodal distribution
rpc_dens <- rpc_density(here(snakemake@input$reads_per_cell), expect_cells = expect_cells)

# define plot title
dens_title <- paste(sample, "reads per cell barcode (top", expect_cells * 2, "cells)")

# create interactive plot
plot_rpc_density(rpc_dens, title = dens_title)

R Markdown Snakemake From line 105 of processing/align_report.Rmd

# extract estimated number of cells
ncells <- rpc_dens$ncells

# calculate cumulative fraction
rpc <- cumfrac_rpc(here(snakemake@input$reads_per_cell))

# define plot title
title <- paste0(sample, " cumulative reads (total reads: ", total_reads, ")")

# plot cumulative fraction plot
plot_cumfrac_rpc(rpc, nbcs = expect_cells * 10, title = title, ncells = ncells)

R Markdown From line 135 of processing/align_report.Rmd

library(optparse)

# create arguments list
option_list = list(
  make_option(c("-t", "--txs_file"), type = "character", default = NULL,
              help = "Path to transcripts annotation file (.gtf)",
              metavar = "character"),
  make_option(c("-b", "--BSgenome_id"), type = "character", default = NULL,
              help = "Identifier for used BSgenome", metavar = "character"),
  make_option(c("-f", "--fasta_outfile"), type = "character", default = NULL,
              help = "Path to .fasta output file", metavar = "character"),
  make_option(c("-o", "--gtf_outfile"), type = "character", default = NULL,
              help = "Path to .gtf output file", metavar = "character")
)

# parse arguments
opt_parser = OptionParser(option_list = option_list)
opt = parse_args(opt_parser)

# function to check for required arguments
check_required_args <- function(arg, opt, opt_parser) {
  if (is.null(opt[[arg]])) {
    print_help(opt_parser)
    stop(arg, " argument is required!", call. = FALSE)
  }
}

# check that all required parameters are provided
required_args <- c("txs_file", "BSgenome_id", "fasta_outfile", "gtf_outfile")
for (i in required_args) {
  check_required_args(i, opt = opt, opt_parser = opt_parser)
}

# define functions =================================================================================

# attach required packages
library(rtracklayer)
library(BSgenome)
library(GenomicRanges)
library(GenomicFeatures)

# main function to create gene loci tap-seq alignment reference based on input file paths
create_tapseq_ref <- function(txs_file, BSgenome_id, fasta_outfile, gtf_outfile) {

  message("\nLoading data...")

  # load transcripts
  txs <- import(txs_file, format ="gtf")

  # get BSgenome object containing the genome sequence
  genome <- getBSgenome(BSgenome_id)
  seqlevelsStyle(genome) <- seqlevelsStyle(txs)[1]  # set seqlevels to same style as txs input

  # create tap-seq alignment references
  message("Creating tap-seq alignment reference...")
  ref <- tapseq_alignment_ref(txs, genome = genome)

  # save alignment reference data
  writeXStringSet(ref$contig_seqs, filepath = fasta_outfile, format = "fasta")
  export_gtf(ref$annot, filepath = gtf_outfile)

  message("Done!")

}

# helper functions to create tap-seq alignment reference data ======================================

# create alignment reference data for gene loci
tapseq_alignment_ref <- function(txs, genome, gene_id = "gene_name") {

  # convert txs to GRanges if it's a GRangesList
  if (is(txs, "GRangesList")) {
    txs <- unlist(txs)
  }

  # abort if genome is not a BSgenome or DNAStringSet object
  if (!any(is(genome, "BSgenome") | is(genome, "DNAStringSet"))) {
    stop("genome must be of class BSgenome or DNAStringSet!", call. = FALSE)
  }

  # sort txs by coordinates
  txs <- sort(txs)

  # group transcripts by gene names
  genes <- split(txs, f = mcols(txs)[, gene_id])

  # create contigs for each locus ------------------------------------------------------------------

  # obtain ranges of each gene locus (start of gene to end of gene)
  loci <- range(genes)

  # merge loci to create non-overlapping contigs
  contigs <- reduce(unlist(loci), ignore.strand = TRUE)
  strand(contigs) <- "+"

  # split contigs into GRangesList
  contigs <- split(contigs, f = 1:length(contigs))

  # find genes overlapping with each contig
  overlaps <- as.data.frame(findOverlaps(contigs, genes, ignore.strand = TRUE))

  # split by query hit to have all overlapping genes per contig in one table
  overlaps <- split(overlaps, f = overlaps$queryHits)

  # create annotations for genes overlapping each contig
  annot <- lapply(overlaps, FUN = create_contig_annot, contigs = contigs, genes = genes)
  annot <- unlist(GRangesList(annot))
  names(annot) <- NULL

  # get contig names based on names of genes overlapping each contig
  names(contigs) <- vapply(overlaps, FUN = contig_names, genes = genes, FUN.VALUE = character(1))

  # get sequences for contigs
  contig_seqs <- extractTranscriptSeqs(genome, transcripts = contigs)

  # create and return output
  list(contig_seqs = contig_seqs, annot = annot)

}

# create annotations for one contig
create_contig_annot <- function(overlap, contigs, genes) {

  # get contig and overlapping genes
  contig <- contigs[[unique(overlap$queryHits)]]
  ol_genes <- unlist(genes[overlap$subjectHits])

  # convert genome coordinates of overlapping genes into coordinates relative
  # to the contig
  start(ol_genes) <- start(ol_genes) - start(contig) + 1
  end(ol_genes) <- end(ol_genes) - start(contig) + 1

  # create new contig name to use as seqnames of output annotations
  gene_names <- names(genes[overlap$subjectHits])
  contig_name <- paste(gene_names, collapse = "_")

  # create new ouptut with contig name as seqnames
  annot <- GRanges(seqnames = contig_name, ranges = ranges(ol_genes), strand = strand(ol_genes))

  # add metadata and return output
  mcols(annot) <- mcols(ol_genes)
  annot

}

# create contig names by exracting the names of all overlapping genes per contig
contig_names <- function(overlap, genes) {

  # get gene names of overlapping genes
  gene_names <- names(genes[overlap$subjectHits])

  # create contig name
  paste(gene_names, collapse = "_")

}

# export GRanges object to gtf file. rtracklayers 'export()' function causes an error with STAR 
export_gtf <- function(x, filepath) {

  # transform x into data.frame
  x <- data.frame(x)

  # transform all columns to character
  x[] <- lapply(x, FUN = as.character)

  # remove the width column (from converting GRanges to data.frame)
  x <- x[,-which(colnames(x) == "width")]

  # replace potential NA for score and phase by '.'
  x$score[is.na(x$score)] <- "."
  x$phase[is.na(x$phase)] <- "."

  # replace potential '*' for strand by '.'
  x$strand[x$strand == "*"] <- "."

  # get first 8 fields for gtf output
  gtf_fields <- x[,c("seqnames", "source", "type", "start",
                     "end", "score", "strand", "phase")]

  ## the remaining fields are processed into the attribute field

  # get remaining fields
  attr_cols <- x[,colnames(x)[!colnames(x) %in% colnames(gtf_fields)]]

  # get attribute names
  attr_names <- colnames(attr_cols)

  # get mandatory attributes "gene_id" and "transcript_id"
  mand <- c(which(attr_names == "gene_id"),
            which(attr_names == "transcript_id"))

  # abort if mandatory fields not found
  if (length(mand) != 2) {

    stop("Mandatory attributes \"gene_id\" and \"transcript_id\" not found!",
         call. = FALSE)

  }

  # order attributes so that the mandatory attributes come first
  attr_names <- c(attr_names[mand], attr_names[-mand])
  attr_cols <- attr_cols[attr_names]

  # create matrix with names
  attr_names <- matrix(rep(attr_names, nrow(attr_cols)),
                       ncol=length(attr_names), byrow = TRUE)

  # paste attribute names to values
  attr <- matrix(paste(attr_names, as.matrix(attr_cols)),
                 nrow = nrow(attr_names))

  # collapse all attributes into one string to create the attribute field
  # and add to gtf_fields
  gtf_fields$attributes <- do.call(paste, c(as.data.frame(attr), sep="; "))
  gtf_fields$attributes <- paste0(gtf_fields$attributes, ';')

  # write output to file
  write.table(gtf_fields, file = filepath, row.names = FALSE,
              col.names = FALSE, quote = FALSE, sep = "\t") 

}

# create tap-seq alignment references ==============================================================

create_tapseq_ref(txs_file = opt$txs_file, BSgenome_id = opt$BSgenome_id,
                  fasta_outfile = opt$fasta_outfile, gtf_outfile = opt$gtf_outfile)

R STAR optparse GenomicRanges BSgenome From line 7 of processing/create_tapseq_annot.R

import argparse
from Bio import SeqIO

# define functions to generate reference data ------------------------------------------------------

# function to generate alignment reference data from a fasta file containing vector sequences
def cropseq_alignment_reference(input_fasta, output_bn, fasta_ext = ".fasta", prefix = ""):

  # output files
  fasta_outfile = output_bn + fasta_ext
  gtf_outfile = output_bn + ".gtf"

  # open output files
  output_fasta = open(fasta_outfile, "w")
  output_gtf  = open(gtf_outfile,  "w")

  # process each input sequence
  for record in SeqIO.parse(input_fasta, "fasta"):

    # add prefix to sequence id
    record.id = prefix + record.id

    # remove name and description to remove them from header in fasta output
    record.name = ""
    record.description = ""

    # create gtf entry
    gtf = gtf_entry(record)

    # write gtf entry and modified fasta record to respective output files
    output_gtf.write("%s\n" % gtf)
    SeqIO.write(record, output_fasta, "fasta")

  # close open files
  output_fasta.close()
  output_gtf.close()

# function to create gtf entry from a crop-seq vector fasta record
def gtf_entry(fasta_record):

  # get sequence name and length
  seq_name = fasta_record.id
  seq_len = len(fasta_record.seq)

  # create gtf attribute field
  attr_names = ["gene_id", "transcript_id", "gene_name", "transcript_name"]
  attr = [s + " " + seq_name for s in attr_names]
  attr = "; ".join(attr) + ";"

  # create gtf entry
  gtf_fields = [seq_name, "VECTOR", "exon", "1", str(seq_len), ".", "+", ".", attr]
  gtf_line = "\t".join(gtf_fields)

  return gtf_line

# create reference files ---------------------------------------------------------------------------

# create crop-seq vector references based on command line arguments if the  is called as main
# program
if __name__ == "__main__":

  # parse command line arguments
  parser = argparse.ArgumentParser(description = ("Create CROP-seq vector "
                                    "alignment references"))
  parser.add_argument("-i", "--input_fasta", type = str, required = True,
                      help = ("Input fasta file containing sequences of CROP-seq vectors."))
  parser.add_argument("-o", "--output_bn", type = str, required = True,
                      help = "Basename for output files.")
  parser.add_argument("--fasta_ext", type = str, default = ".fasta",
                      help = "Filename extension of fasta files "
                      "(default: .fasta).")
  parser.add_argument("--prefix", type = str, default = "",
                      help = "Optional prefix to be added to sequence name.")
  args = parser.parse_args()

  # create crop-seq vector reference
  cropseq_alignment_reference(input_fasta = args.input_fasta,
                              output_bn = args.output_bn,
                              fasta_ext = args.fasta_ext,
                              prefix = args.prefix)

Python Biopython From line 6 of processing/cropseq_vector_reference.py

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# load required packages
library(tidyverse)
library(here)
library(plotly)

R Markdown tidyverse here plotly From line 15 of processing/dge_report_no_perts.Rmd

# read tpt histogram data
tpt_hist <- read.table(here(snakemake@input$tpt_hist), header = TRUE, stringsAsFactors = FALSE)

# bin tpt values into better intervals for plotting
bins <- seq(0, 1, by = 0.1)
tpt_hist_plot <- tpt_hist %>%
  mutate(tpt_bin = cut(tpt, bins)) %>%
  group_by(tpt_bin) %>%
  summarize(transcripts = sum(transcripts))

# plot histogram
ggplot(tpt_hist_plot, aes(x = tpt_bin, y = transcripts)) +
  geom_bar(stat = "identity") +
  labs(x = "Transcripts-per-transcripts", y = "Transcripts", title = "TPT distribution") +
  theme_bw()

R Markdown ggplot2 From line 41 of processing/dge_report_no_perts.Rmd

# read dge summary statistics
dge_stats <- read.table(here(snakemake@input$dge_stats), header = TRUE, stringsAsFactors = FALSE)

# sort dge_stats according to number of detected transcripts and transform cell barcode into factor
dge_stats <- arrange(dge_stats, desc(transcripts)) %>%
  mutate(cell_barcode = forcats::fct_inorder(cell_barcode))

# calculate number of cell barcodes lost because of cell barcode whitelist filtering (for text only)
ncells <- snakemake@config$cell_numbers[[snakemake@wildcards$sample]]
filt_cells <- ncells - nrow(dge_stats)

R Markdown From line 65 of processing/dge_report_no_perts.Rmd

# number of transcripts
ggplot(dge_stats, aes(x = cell_barcode, y = transcripts)) +
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of transcripts")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

# number of genic reads
ggplot(dge_stats, aes(x = cell_barcode, y = genic_reads)) + 
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of genic reads")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

# number of detected genes
ggplot(dge_stats, aes(x = cell_barcode, y = genes)) + 
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of detected genes")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

R Markdown ggplot2 From line 86 of processing/dge_report_no_perts.Rmd

# create scatter plot with plotly for small datasets or ggplot for larget datasets (>20k cells)
if (nrow(dge_stats) <= 20000) {

  # create scatter plot
  p <- plot_ly(dge_stats) %>%
    add_trace(x = ~transcripts, y = ~genic_reads, type = "scatter", mode = "markers",
              marker = list(opacity = 0.75, line = list(width = 2)), hoverinfo = "text",
              text = ~paste0("Cell barcode: ", cell_barcode, "\n",
                             "Genic reads: ", genic_reads, "\n",
                             "Transcripts: ", transcripts))

  # define axes and title
  xaxis <- list(title = "Number of detected transcripts (UMIs)", type = "log")
  yaxis <- list(title = "Number of genic reads", type = "log")
  title <- "Genic reads vs. transcripts"

  # add layout and print plot
layout(p, xaxis = xaxis, yaxis = yaxis, title = title)

}else{

  # create scatter plot
  ggplot(dge_stats, aes(x = transcripts, y = genic_reads)) +
    geom_abline() +
    geom_point(color = "#1f77b4") +
    labs(x = "Number of detected transcripts (UMIs)", y = "Number of genic reads",
         title = "Genic reads vs. transcripts") +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

}

R Markdown ggplot2 plotly From line 117 of processing/dge_report_no_perts.Rmd

# add index for plotting
dge_stats$index <- 1:nrow(dge_stats)

# create scatter plot with plotly for small datasets or ggplot for larget datasets (>20k cells)
if (nrow(dge_stats) <= 20000) {

  # create plot
  p <- plot_ly(dge_stats) %>%
    add_trace(x = ~index, y = ~transcripts, type = "scatter", mode = "markers",
              marker = list(color = "darkgray"), name = "Cells", hoverinfo = "none") %>%
    add_trace(x ~index, y = ~transcripts, type = "scatter", mode = "lines",
              line = list(color = "#1f77b4"), name = "transcripts per cell",
              hoverinfo = "text", text = ~paste0("Cells: ", index, "\n",
                              "UMIs cell i: ", transcripts))

  # define axes and title
  xaxis <- list(title = "Cell barcodes sorted by number of transcripts [descending]")
  yaxis <- list(title = "Number of transcripts", type = "log")
  title <- "Number of transcripts per cell"

  # add layout and print plot
  layout(p, xaxis = xaxis, yaxis = yaxis, title = title)

}else{

  ggplot(dge_stats, aes(x = index, y = transcripts)) +
    geom_point(color = "darkgray") +
    geom_line(color = "#1f77b4") +
    labs(x = "Cell barcodes sorted by number of transcripts [descending]",
         y = "Number of transcripts", title = "Number of transcripts per cell") +
    scale_y_log10() +
    theme_bw()

}

R Markdown ggplot2 plotly From line 159 of processing/dge_report_no_perts.Rmd

# set global chunk options
knitr::opts_chunk$set(echo = FALSE)

# load required packages
library(tidyverse)
library(here)
library(plotly)

R Markdown tidyverse here plotly From line 15 of processing/dge_report.Rmd

# read tpt histogram data
tpt_hist <- read.table(here(snakemake@input$tpt_hist), header = TRUE, stringsAsFactors = FALSE)

# bin tpt values into better intervals for plotting
bins <- seq(0, 1, by = 0.1)
tpt_hist_plot <- tpt_hist %>%
  mutate(tpt_bin = cut(tpt, bins)) %>%
  group_by(tpt_bin) %>%
  summarize(transcripts = sum(transcripts))

# plot histogram
ggplot(tpt_hist_plot, aes(x = tpt_bin, y = transcripts)) +
  geom_bar(stat = "identity") +
  labs(x = "Transcripts-per-transcripts", y = "Transcripts", title = "TPT distribution") +
  theme_bw()

R Markdown ggplot2 From line 41 of processing/dge_report.Rmd

# read dge summary statistics
dge_stats <- read.table(here(snakemake@input$dge_stats), header = TRUE, stringsAsFactors = FALSE)

# sort dge_stats according to number of detected transcripts and transform cell barcode into factor
dge_stats <- arrange(dge_stats, desc(transcripts)) %>%
  mutate(cell_barcode = forcats::fct_inorder(cell_barcode))

# calculate number of cell barcodes lost because of cell barcode whitelist filtering (for text only)
ncells <- snakemake@config$cell_numbers[[snakemake@wildcards$sample]]
filt_cells <- ncells - nrow(dge_stats)

R Markdown From line 65 of processing/dge_report.Rmd

# number of transcripts
ggplot(dge_stats, aes(x = cell_barcode, y = transcripts)) +
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of transcripts")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

# number of genic reads
ggplot(dge_stats, aes(x = cell_barcode, y = genic_reads)) + 
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of genic reads")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

# number of detected genes
ggplot(dge_stats, aes(x = cell_barcode, y = genes)) + 
  geom_bar(stat = "identity") + 
  labs(title = paste(snakemake@wildcards$sample, "number of detected genes")) +
  theme_bw() +
  theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())

R Markdown ggplot2 From line 86 of processing/dge_report.Rmd

# create scatter plot with plotly for small datasets or ggplot for larget datasets (>20k cells)
if (nrow(dge_stats) <= 20000) {

  # create scatter plot
  p <- plot_ly(dge_stats) %>%
    add_trace(x = ~transcripts, y = ~genic_reads, type = "scatter", mode = "markers",
              marker = list(opacity = 0.75, line = list(width = 2)), hoverinfo = "text",
              text = ~paste0("Cell barcode: ", cell_barcode, "\n",
                             "Genic reads: ", genic_reads, "\n",
                             "Transcripts: ", transcripts))

  # define axes and title
  xaxis <- list(title = "Number of detected transcripts (UMIs)", type = "log")
  yaxis <- list(title = "Number of genic reads", type = "log")
  title <- "Genic reads vs. transcripts"

  # add layout and print plot
layout(p, xaxis = xaxis, yaxis = yaxis, title = title)

}else{

  # create scatter plot
  ggplot(dge_stats, aes(x = transcripts, y = genic_reads)) +
    geom_abline() +
    geom_point(color = "#1f77b4") +
    labs(x = "Number of detected transcripts (UMIs)", y = "Number of genic reads",
         title = "Genic reads vs. transcripts") +
    scale_x_log10() +
    scale_y_log10() +
    theme_bw()

}

R Markdown ggplot2 plotly From line 117 of processing/dge_report.Rmd

# add index for plotting
dge_stats$index <- 1:nrow(dge_stats)

# create scatter plot with plotly for small datasets or ggplot for larget datasets (>20k cells)
if (nrow(dge_stats) <= 20000) {

  # create plot
  p <- plot_ly(dge_stats) %>%
    add_trace(x = ~index, y = ~transcripts, type = "scatter", mode = "markers",
              marker = list(color = "darkgray"), name = "Cells", hoverinfo = "none") %>%
    add_trace(x ~index, y = ~transcripts, type = "scatter", mode = "lines",
              line = list(color = "#1f77b4"), name = "transcripts per cell",
              hoverinfo = "text", text = ~paste0("Cells: ", index, "\n",
                              "UMIs cell i: ", transcripts))

  # define axes and title
  xaxis <- list(title = "Cell barcodes sorted by number of transcripts [descending]")
  yaxis <- list(title = "Number of transcripts", type = "log")
  title <- "Number of transcripts per cell"

  # add layout and print plot
  layout(p, xaxis = xaxis, yaxis = yaxis, title = title)

}else{

  ggplot(dge_stats, aes(x = index, y = transcripts)) +
    geom_point(color = "darkgray") +
    geom_line(color = "#1f77b4") +
    labs(x = "Cell barcodes sorted by number of transcripts [descending]",
         y = "Number of transcripts", title = "Number of transcripts per cell") +
    scale_y_log10() +
    theme_bw()

}

R Markdown ggplot2 plotly From line 159 of processing/dge_report.Rmd

# load dge data
dge <- read.table(here(snakemake@input$dge), header = TRUE, stringsAsFactors = FALSE)

# extract vector transcripts
vctr_dge <- dge %>%
  filter(grepl(GENE, pattern = snakemake@params$vector_prefix)) %>%
  gather(key = "cell", value = "txs", -GENE)

# count number of total vector molecules per cell
vctr_txs_per_cell <- vctr_dge %>%
  group_by(cell) %>%
  summarize(vctr_txs = sum(txs))

# plot number of transcripts per cell distribution
ggplot(vctr_txs_per_cell, aes(vctr_txs)) +
  geom_histogram(bins = 25) +
  labs(x = "Vector transcripts", y = "Cells", title = "Vector transcripts per cell") +
  theme_bw()

R Markdown ggplot2 From line 205 of processing/dge_report.Rmd

# load perturbation matrix
perturb <- read.table(here(snakemake@input$perturb_stats), header = TRUE, stringsAsFactors = FALSE)

# convert to long format
perturb <- perturb %>%
  gather(key = "cell", value = "pert", -VECTOR)

# compute the number of perturbations per cell
pert_per_cell <- perturb %>%
  group_by(cell) %>%
  summarize(pert_per_cell = sum(pert)) %>%
  count(pert_per_cell)

# plot number of perturbations per cell
ggplot(pert_per_cell, aes(x = pert_per_cell, y = n)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = 0) +
  labs(x = "Perturbations", y = "Cells", title = "Perturbations per cell") +
  theme_bw()

R Markdown ggplot2 From line 232 of processing/dge_report.Rmd

# calculate number of cells per perturbation
cells_per_pert <- perturb %>%
  group_by(VECTOR) %>%
  summarize(cells = sum(pert))

# plot number of cell per perturbation
p <- ggplot(cells_per_pert,
            aes(x = forcats::fct_reorder(VECTOR, .x = cells, .desc = TRUE), y = cells)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(x = "Perturbation", y = "Cells", title = "Number of cells per perturbation") +
  theme_bw()

# print plot with layout appropriate for number of perturbations
if (nrow(cells_per_pert) <= 30) {
  p + coord_flip()
}else{
  p + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(), panel.grid = element_blank())
}

R Markdown ggplot2 From line 260 of processing/dge_report.Rmd

from __future__ import print_function
import sys, argparse, os, gzip
import pandas as pd
import numpy as np

# define functions ---------------------------------------------------------------------------------

# helper function to print to stderr
def eprint(*args, **kwargs):
  print(*args, file = sys.stderr, **kwargs)

# filter for cell barcodes on whitelist
def filter_cell_barcodes(umi_obs, whitelist):

  # get umi observations on whitelist
  bc_filter = umi_obs['Cell Barcode'].isin(whitelist)
  filt_umi_obs = umi_obs[bc_filter]

  # get number of filtered out reads and barcodes
  n_filt_bc = len(umi_obs[~bc_filter]['Cell Barcode'].unique().tolist())
  n_filt_reads = umi_obs[~bc_filter]['Num_Obs'].sum()
  perc_filt_reads = n_filt_reads / umi_obs['Num_Obs'].sum() * 100

  # report filtered barcodes and reads to stderr
  eprint(' Removed ' + str(n_filt_reads) + ' reads (' + str(round(perc_filt_reads, 3)) + '%) ' +
         'from ' + str(n_filt_bc) + ' cell barcodes')

  return(filt_umi_obs)

# sample a specified number of total genic reads from the umi counts
def sample_reads(umi_obs, n, seed = None):

  # repeat each bc-umi-gene combination by the number of observations to recreate pool of reads
  reads = umi_obs.reindex(umi_obs.index.repeat(umi_obs.Num_Obs))

  # randomly sample n reads
  sample_reads = reads.sample(n = n, replace = False, random_state = seed)

  # count the number of times each read (bc-umi-gene combination) is found in the downsampled data
  read_counts = sample_reads.groupby(['Cell Barcode', 'Gene', 'Molecular_Barcode']).size()

  # reset index to "ungroup" data frame and rename counts column
  read_counts = read_counts.reset_index().rename(columns = {0:'Num_Obs'})

  return(read_counts)

# calculate transcripts per transcripts from umi observations (output from  Drop-seq tool:
# GatherMolecularBarcodeDistributionByGene)
def calculate_tpt(umi_obs):

  # calculate the total number of reads per cell-umi barcode combination
  reads = umi_obs.groupby(['Cell Barcode', 'Molecular_Barcode'])['Num_Obs'].sum()

  # convert to DataFrame with barcodes as columns and rename Num_obs column
  reads = pd.DataFrame(reads).reset_index()
  reads = reads.rename(columns = {'Num_Obs': 'total_reads'})

  # add total reads to umi_counts
  umi_obs = pd.merge(umi_obs, reads, how = 'left', on = ['Cell Barcode', 'Molecular_Barcode'])

  # calculate the fraction of total reads for each gene-cell-umi combination, where total reads is
  # the sum of reads for that cell-umi combination. this metric is also called transcripts per
  # transcript (tpt).
  umi_obs['tpt'] = np.divide(umi_obs['Num_Obs'], umi_obs['total_reads'])
  umi_tpt = umi_obs.drop(columns = 'total_reads')

  return(umi_tpt)

# compute tpt histogram from calculate_tpt() output
def compute_tpt_histogram(umi_tpt):

  # calculate number of bc-umi-gene combinations per tpt value
  tpt_hist = pd.value_counts(umi_tpt.tpt).to_frame().reset_index()

  # rename columns and sort according to tpt value
  tpt_hist.columns = ['tpt','transcripts']
  tpt_hist = tpt_hist.sort_values('tpt', ascending = False)

  return(tpt_hist)

# filter umi observations based on minimum tbt value
def filter_tpt(umi_tpt, tpt_threshold):

  # retain observations with tbt >= tbt_threshold
  filter_logical = umi_tpt['tpt'] >= tpt_threshold
  umi_filt = umi_tpt[filter_logical]

  eprint('  Removed ' + str(np.round(100 * (1 - np.mean(filter_logical)), 3)) + '% of transcripts')

  return(umi_filt)

# calculate dge summary based of umi observations. calculates number of genic reads, molecules and
# detected genes per cell
def dge_summary(umi_obs):

  # calculate number of genic reads, detected transcripts and genes per cell barcode
  agg_funs = {'Num_Obs': ['sum', 'count'], 'Gene' : 'nunique'}
  stats = umi_obs.groupby('Cell Barcode').agg(agg_funs)

  # sort according to number of genic reads
  stats = stats.sort_values([('Num_Obs', 'sum')], ascending = False)

  # make cell barcode a regular column and set new column names
  stats = stats.reset_index()
  stats.columns = ['cell_barcode', 'genic_reads', 'transcripts', 'genes']

  return(stats)

# convert umi observations 
def create_dge_matrix(umi_obs):

  # count number of umis per cell barcode and gene
  umi_counts = umi_obs.groupby(['Cell Barcode', 'Gene']).size()

  # convert to wide format ('unstack') with missing values set to 0 to create dge expression matrix
  dge_matrix = umi_counts.unstack(level = 0, fill_value = 0)

  # rename 'Gene' index to 'GENE' to mirror Drop-seq tools DGE output
  dge_matrix.index.rename('GENE', inplace = True)

  return(dge_matrix)

# execute if run as main program -------------------------------------------------------------------

# parse command line arguments
if __name__ == '__main__':

  # function to check if an argument contains None or a positive integer
  def none_or_int(arg):
    if arg == 'None':
      return(None)
    elif arg.isdigit():
      return(int(arg))
    else:
      msg = "invalid None or int value: %r" % arg
      raise argparse.ArgumentTypeError(msg)

  parser = argparse.ArgumentParser(description = 'Extract DGE data from UMI observations. This \
    program allows for downsampling of total genic reads and filtering for potential chimeric \
    reads. It returns a text file containing the gene expression matrix.')

  parser.add_argument('-i', '--inputfile', help = 'File containing UMI observations as produced by \
    GatherMolecularBarcodeDistributionByGene from Drop-seq tools.', required = True)
  parser.add_argument('-o', '--outputfile', help = 'Output file for filtered digital gene \
    expression matrix. TPT histogram and DGE summary files will be written into the same directory',
    required = True)
  parser.add_argument('-w', '--whitelist', help = 'File containing possible true cell barcodes \
    without header and one cell barcode per line. If provided only cell barcodes on this \
    whitelist will be considered. Default = None, which turns cell barcode filtering off.',
    default = None)
  parser.add_argument('--tpt_threshold', help = 'Minimum transcript per transcript value for \
    gene-cell-umi combinations to pass chimeric reads filtering. Default = 0, which does not \
    remove any reads', type = float, default = 0.0)
  parser.add_argument('--sample', help = 'Number of genic reads (int) to be drawn from input for \
    downsampling. Default = None, which turns sampling off.', type = none_or_int, default = 0)
  parser.add_argument('--seed', help = 'Seed (int) for the random number generator used when \
    downsampling. Default = None', type = none_or_int, default = None)

  args = parser.parse_args()

  # read umi observations input file
  eprint('Reading input file...')
  umi_obs = pd.read_csv(args.inputfile, sep = '\t')

  # filter for cell barcodes on whitelist (if whitelist file is provided)
  if args.whitelist:
    eprint('Filtering for cell barcodes on provided whitelist...')
    if args.whitelist.endswith('.gz'):
      with gzip.open(args.whitelist, 'rt') as f:
        whitelist = f.read().splitlines()
    else:
      with open(args.whitelist, 'r') as f:
        whitelist = f.read().splitlines()
    umi_obs = filter_cell_barcodes(umi_obs, whitelist)

  # downsample total genic reads if specified
  if args.sample:
    if args.sample <= umi_obs.Num_Obs.sum():
      eprint('Downsampling to ' + str(args.sample) + ' reads...')
      umi_obs = sample_reads(umi_obs, n = args.sample, seed = args.seed)
    else:
      eprint('Desired sampling size larger than number of input reads! Exiting.')
      quit()

  # calculate tpt
  eprint('Calculating TPT...')
  umi_tpt = calculate_tpt(umi_obs)

  # compute tpt histogram
  tpt_hist = compute_tpt_histogram(umi_tpt)

  # write histogram to text file
  hist_file = os.path.splitext(args.outputfile)[0] + '_tpt_histogram.txt'
  tpt_hist.to_csv(hist_file, sep = '\t', index = False)

  # filter transcripts based on minimum tbt
  eprint('Filtering for chimeric reads (min TPT = ' + str(args.tpt_threshold) + ')...')
  umi_filt = filter_tpt(umi_tpt, tpt_threshold = args.tpt_threshold)

  # calculate dge summary statistics
  dge_stats = dge_summary(umi_filt)

  # write dge summary to file
  dge_stats_file = os.path.splitext(args.outputfile)[0] + '_summary.txt'
  dge_stats.to_csv(dge_stats_file, sep = '\t', index = False)

  # create expression matrix and save to tab deliminted .txt file
  dge = create_dge_matrix(umi_filt)
  eprint('Writing DGE matrix to file...')
  dge.to_csv(args.outputfile, sep = '\t', index = True)

  eprint('Done!')

Python Pandas numpy From line 9 of processing/extract_dge.py

library("optparse")

# create arguments list
option_list = list(
  make_option(c("-i", "--infile"), type = "character", default = NULL,
              help = "Path to file containing digital expression (DGE) input data.",
              metavar = "character"),
  make_option(c("-o", "--outfile"), type = "character", default = NULL,
              help = "Path to output file for perturbation status matrix", metavar = "character"),
  make_option(c("-v", "--vector_pattern"), type = "character", default = NULL,
              help = paste(
                     "Identifier common to all CROP-seq vectors, for instance a prefix or suffix",
                     "to the unique vector identifier. For instance in ID 'CROPseq_dCas9_DS_01'",
                     "the common identifier is 'CROPseq_dCas9_DS_' while '01' is the unique vector",
                     "identifier. This parameter is needed to extract vector expression data from",
                     "DGE data.", sep = " \n\t\t"
                     ),
              metavar = "character"),
  make_option(c("-m", "--min_txs"), type = "integer", default = NULL,
              help = "Minimum number (integer) of transcripts per vector to define perturbation",
              metavar = "integer"),
  make_option(c("-t", "--trim_id"), action = "store_true", default = FALSE,
              help = "Triggers removal of the vector pattern output vector IDs.")
)

# parse arguments
opt_parser = OptionParser(option_list = option_list)
opt = parse_args(opt_parser)

# function to check for required arguments
check_required_args <- function(arg, opt, opt_parser) {

  if (is.null(opt[[arg]])) {

    print_help(opt_parser)
    stop(arg, " argument is required!", call. = FALSE)

  }

}

# check that all required parameters are provided
required_args <- c("infile", "outfile", "vector_pattern", "min_txs")
for (i in required_args) {

  check_required_args(i, opt = opt, opt_parser = opt_parser)

}

# define main function -----------------------------------------------------------------------------

# infer perturbation status of each cell based on simple transcript cutoff
infer_pert <- function(dge, vector_pattern, min_txs) {

  # filter for vector expression data
  vctrs <- grep(dge$GENE, pattern = vector_pattern)
  vctr_dge <- dge[vctrs, ]

  # move gene name to rownames
  rownames(vctr_dge) <- vctr_dge$GENE
  vctr_dge <- vctr_dge[, which(colnames(vctr_dge) != "GENE")]

  # infer perturbation status of each cell
  pert <- as.data.frame((vctr_dge >= min_txs) * 1)

  # add column for vector id
  pert <- cbind.data.frame("VECTOR" = rownames(pert), pert, stringsAsFactors = FALSE)
  rownames(pert) <- NULL

  return(pert)

}

# infer perturbation status of specified input file ------------------------------------------------

message("Loading DGE data...")
dge <- read.table(opt$infile, header = TRUE, stringsAsFactors = FALSE)

message("Inferring perturbation status...")
pert_status <- infer_pert(dge, vector_pattern = opt$vector_pattern, min_txs = opt$min_txs)

# remove vector pattern (common vector identifier) from output if specified
if (opt$trim_id == TRUE) {

  message("Trimming vector pattern from vector IDs...")
  pert_status$VECTOR <- sub(opt$vector_pattern, "", pert_status$VECTOR)

}

message("Writing to output file...")
write.table(pert_status, file = opt$outfile, quote = FALSE, sep = "\t", row.names = FALSE)

message("Done!")