Basic RNA-seq Analysis Workflow with HISAT2, featureCounts, and DESeq2

Introduction

This repository contains a Snakemake workflow for basic RNA-seq analysis, using HISAT2 for alignment, featureCounts for quantification, and DESeq2 for differential expression.

This workflow performs the following steps:

  1. Optional: Retrieval of publicly available sequencing data from the SRA (fasterq-dump)
  2. Download and preparation of the reference genome and annotation
  3. Alignment of reads to the reference genome with HISAT2
  4. Sorting, indexing, merging, and filtering of alignments with samtools/sambamba
  5. Generation of coverage tracks with bamCoverage, with optional spike-in and z-score normalization
  6. Gene-level counting with featureCounts and RPKM computation
  7. Differential expression analysis with DESeq2
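
Each step is implemented either as a versioned Snakemake wrapper or as a script vendored under scripts/; the snippets below show both. As a rough, illustrative sketch of the pattern (this rule and its paths are assumptions, not copied from the workflow), a rule binds its files and hands off to a wrapper:

rule samtools_index:
    input:
        "sorted/{sample}.bam",          # hypothetical path layout
    output:
        "sorted/{sample}.bam.bai",
    threads: 4                          # the wrapper passes threads - 1 to samtools -@
    log:
        "logs/samtools_index/{sample}.log",
    wrapper:
        "v1.1.0/bio/samtools/index"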

Code Snippets

sambamba view wrapper (v1.1.0/bio/sambamba/view):

__author__ = "Jan Forster"
__copyright__ = "Copyright 2021, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


import os

from snakemake.shell import shell

in_file = snakemake.input[0]
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# add --sam-input for SAM input unless the caller already set a SAM flag
if in_file.endswith(".sam") and (
    "-S" not in extra and "--sam-input" not in extra
):
    extra += " --sam-input"

shell(
    "sambamba view {extra} -t {snakemake.threads} "
    "{snakemake.input[0]} > {snakemake.output[0]} "
    "{log}"
)
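
A rule invoking this wrapper could look like the following sketch (the paths and the `-f bam` option are assumptions); note that a `.sam` input makes the script add `--sam-input` automatically:

rule sambamba_view:
    input:
        "mapped/{sample}.sam",           # .sam input triggers --sam-input above
    output:
        "mapped/{sample}.bam",
    params:
        extra="-f bam",                  # assumed output-format option
    threads: 4
    log:
        "logs/sambamba_view/{sample}.log",
    wrapper:
        "v1.1.0/bio/sambamba/view"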
samtools idxstats wrapper (v1.1.0/bio/samtools/idxstats):

__author__ = "Antonie Vietor"
__copyright__ = "Copyright 2020, Antonie Vietor"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell("samtools idxstats {snakemake.input.bam} > {snakemake.output[0]} {log}")
samtools index wrapper (v1.1.0/bio/samtools/index):

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools index
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {snakemake.params} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
samtools merge wrapper (v1.1.0/bio/samtools/merge):

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools merge {threads} {snakemake.params} "
    "{snakemake.output[0]} {snakemake.input} "
    "{log}"
)
samtools sort wrapper (v1.1.0/bio/samtools/sort):

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

out_name, out_ext = os.path.splitext(snakemake.output[0])

tmp_dir = snakemake.params.get("tmp_dir", "")
if tmp_dir:
    prefix = os.path.join(tmp_dir, os.path.basename(out_name))
else:
    prefix = out_name

# Samtools takes additional threads through its option -@
# One thread for samtools
# Other threads are *additional* threads passed to the argument -@
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools sort {extra} {threads} -o {snakemake.output[0]} "
    "-T {prefix} {snakemake.input[0]} "
    "{log}"
)
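
The `tmp_dir` param above only relocates the `-T` sort prefix; output placement is controlled by `-o`. A hedged example rule (the paths and memory option are assumptions):

rule samtools_sort:
    input:
        "mapped/{sample}.bam",
    output:
        "sorted/{sample}.bam",
    params:
        extra="-m 4G",                  # assumed per-thread memory option
        tmp_dir="/tmp",                 # optional scratch space for sort chunks
    threads: 8
    log:
        "logs/samtools_sort/{sample}.log",
    wrapper:
        "v1.1.0/bio/samtools/sort"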
HISAT2 alignment wrapper (v1.3.2/bio/hisat2/align):

__author__ = "Wibowo Arindrarto"
__copyright__ = "Copyright 2016, Wibowo Arindrarto"
__email__ = "[email protected]"
__license__ = "BSD"


from snakemake.shell import shell

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")
# Run log
log = snakemake.log_fmt_shell()

# Input file wrangling
reads = snakemake.input.get("reads")
if isinstance(reads, str):
    input_flags = "-U {0}".format(reads)
elif len(reads) == 1:
    input_flags = "-U {0}".format(reads[0])
elif len(reads) == 2:
    input_flags = "-1 {0} -2 {1}".format(*reads)
else:
    raise RuntimeError(
        "Reads parameter must contain at least 1 and at most 2 input files."
    )

# Executed shell command
shell(
    "(hisat2 {extra} "
    "--threads {snakemake.threads} "
    " -x {snakemake.params.idx} {input_flags} "
    " | samtools view -Sbh -o {snakemake.output[0]} -) "
    " {log}"
)
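
Because of the input wrangling above, the same wrapper serves single- and paired-end data: one FASTQ (or a one-element list) becomes `-U`, a two-element list becomes `-1`/`-2`. A hedged paired-end sketch (the paths and the extra flag are assumptions):

rule hisat2_align:
    input:
        reads=["reads/{sample}_1.fastq.gz", "reads/{sample}_2.fastq.gz"],  # two files -> -1/-2
    output:
        "mapped/{sample}.bam",           # piped through samtools view to BAM
    params:
        idx="index/genome",              # HISAT2 index prefix
        extra="--no-unal",               # assumed: drop unaligned reads
    threads: 8
    log:
        "logs/hisat2_align/{sample}.log",
    wrapper:
        "v1.3.2/bio/hisat2/align"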
HISAT2 index wrapper (v1.3.2/bio/hisat2/index):

__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

import os
from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allow multiple FASTA files or literal sequences
fasta = snakemake.input.get("fasta")
assert fasta is not None, "input: a FASTA file or a sequence is required"
fasta_files = fasta if isinstance(fasta, list) else [fasta]

# inputs without a file extension are treated as literal sequences (-c)
input_seq = ""
if not all("." in f for f in fasta_files):
    input_seq += "-c "
input_seq += ",".join(fasta_files)

hisat_dir = snakemake.params.get("prefix", "")
if hisat_dir:
    os.makedirs(hisat_dir, exist_ok=True)  # tolerate a pre-existing index directory

shell(
    "hisat2-build {extra} "
    "-p {snakemake.threads} "
    "{input_seq} "
    "{snakemake.params.prefix} "
    "{log}"
)
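
hisat2-build writes eight `.ht2` files named after the prefix. A sketch of an indexing rule under those assumptions (paths illustrative):

rule hisat2_index:
    input:
        fasta="resources/genome.fasta",
    output:
        multiext("index/genome", ".1.ht2", ".2.ht2", ".3.ht2", ".4.ht2",
                 ".5.ht2", ".6.ht2", ".7.ht2", ".8.ht2"),
    params:
        prefix="index/genome",           # passed to hisat2-build as the index base name
    threads: 8
    log:
        "logs/hisat2_index.log",
    wrapper:
        "v1.3.2/bio/hisat2/index"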
featureCounts wrapper (v1.3.2/bio/subread/featurecounts):

__author__ = "Antonie Vietor"
__copyright__ = "Copyright 2020, Antonie Vietor"
__email__ = "[email protected]"
__license__ = "MIT"

import tempfile
from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")

# optional input files and directories
fasta = snakemake.input.get("fasta", "")
chr_names = snakemake.input.get("chr_names", "")
r_path = snakemake.params.get("r_path", "")

if fasta:
    extra += " -G {}".format(fasta)
if chr_names:
    extra += " -A {}".format(chr_names)
if r_path:
    extra += " --Rpath {}".format(r_path)

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "featureCounts"
        " -T {snakemake.threads}"
        " -a {snakemake.input.annotation}"
        " {extra}"
        " --tmpDir {tmpdir}"
        " -o {snakemake.output[0]}"
        " {snakemake.input.samples}"
        " {log}"
    )
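
featureCounts takes all BAMs at once and writes a single count matrix (plus a `.summary` file next to it). A hedged rule sketch (the sample names, paths, and attribute options are assumptions):

rule featurecounts:
    input:
        samples=expand("sorted/{sample}.bam", sample=["A", "B"]),  # assumed sample names
        annotation="resources/annotation.gtf",
    output:
        "counts/all.featureCounts",
    params:
        extra="-t exon -g gene_id",      # count exon features, summarize by gene_id
    threads: 8
    log:
        "logs/featurecounts.log",
    wrapper:
        "v1.3.2/bio/subread/featurecounts"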
Rule fragments (directives only) from the workflow's rules files:

script:
    "../scripts/fasterq-dump.py"

script:
    "../scripts/fasterq-dump.py"

shell:
    "cat {input} > {output} 2> {log}"

wrapper:
    "v1.3.2/bio/hisat2/align"

wrapper:
    "v1.1.0/bio/samtools/sort"

wrapper:
    "v1.1.0/bio/samtools/index"

shell:
    "bamCoverage --bam {input.bam} -o {output} -p {threads} {params.extra}"

wrapper:
    "v1.1.0/bio/samtools/merge"

wrapper:
    "v1.1.0/bio/samtools/index"

shell:
    "bamCoverage --bam {input.bam} -o {output} -p {threads} {params.extra}"

script:
    "../scripts/zscore_normalize_bw.R"

script:
    "../scripts/zscore_normalize_bw.R"

script:
    "../scripts/compute_scaling_factors.R"

script:
    "../scripts/spikeIn_normalize_bw.R"

script:
    "../scripts/spikeIn_normalize_bw.R"

wrapper:
    "v1.3.2/bio/subread/featurecounts"

script:
    "../scripts/compute_RPKM.R"

script:
    "../scripts/DEseq2.R"

script:
    "../scripts/DEseq2_results.R"

wrapper:
    "v1.1.0/bio/samtools/idxstats"

wrapper:
    "v1.1.0/bio/sambamba/view"

wrapper:
    "v1.1.0/bio/samtools/index"

wrapper:
    "v1.1.0/bio/samtools/idxstats"

shell:
    "samtools view -bh -L {input.keep_chroms} --output-fmt BAM -o {output} {input.bam} 2>> {log}"

wrapper:
    "v1.1.0/bio/sambamba/view"

wrapper:
    "v1.1.0/bio/samtools/index"

wrapper:
    "v1.1.0/bio/samtools/idxstats"

From line 100 of rules/filter.smk
Rule fragments from rules/ref.smk:

shell:
    """
    curl {params.link} > {output}.gz 2> {log}
    unpigz {output}.gz 2>> {log}
    """
(from line 12)

shell:
    "curl {params.link} > {output} 2> {log}"
(from line 28)

shell:
    """
    curl {params.link} > {output}.gz 2> {log}
    unpigz {output}.gz 2>> {log}
    """
(from line 42)

shell:
    """
    seqkit replace -p "(.+)" -r "spikeIn_\$1" -o resources/tmp_spikeIn.fasta {input.spikeIn} 2> {log}
    cat {input.ref} resources/tmp_spikeIn.fasta > {output} 2>> {log}
    rm resources/tmp_spikeIn.fasta
    """

shell:
    "mv {input} {output} 2> {log}"
(from line 74)

shell:
    "seqkit grep -f {input.keep_chroms} {input.genome}"
    " | seqkit fx2tab -nil"
    " | awk -v OFS='\t' '{{print $1, 1, $2}}' > {output}"

wrapper:
    "v1.3.2/bio/hisat2/index"
(from line 106)
RPKM computation script (scripts/compute_RPKM.R):

library(GenomicRanges)
library(tidyverse)

# function definitions =========================================================
# define function to compute RPKM
rpkm <- function(count_table, widths) {
  require(tidyverse)

  row_names <- tibble(id = rownames(count_table))


  # calculate rpkm
  kb_widths <- widths / 10^3

  column_rpkm <- function(x, widths = kb_widths) {
    cr <- (x / widths) / (sum(x) / 10^6 )
    return(cr)
  }

  all_rpkm <- count_table |>
    map(column_rpkm) |>
    as_tibble() |>
    bind_cols(row_names) |>
    column_to_rownames(var = "id")

  return(all_rpkm)

}

# read in count data ===========================================================
counts <- read_tsv(snakemake@input[[1]], comment = "#") |> 
  select(-c(2:5)) |> 
  column_to_rownames(var = "Geneid")
colnames(counts) <- gsub(".bam", "", basename(colnames(counts)))
# perform RPKM normalization ===================================================
rpkm_table <- rpkm(count_table = select(counts, -c("Length")), widths = counts$Length)

# add gene names to RPKM table =================================================
gene_annotation <- rtracklayer::import(snakemake@input[["annotation"]]) |> 
  mcols() |>
  as.data.frame() |>
  filter(type == "gene") |>
  dplyr::select(gene_id, gene_symbol)

rpkm_table <- rpkm_table |> 
  rownames_to_column(var = "gene_id") |> 
  left_join(gene_annotation, by = "gene_id") |> 
  select(gene_id, gene_symbol, everything())

# write output table ===========================================================
rpkm_table |> 
  write_tsv(snakemake@output[[1]])
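
`column_rpkm` above implements the textbook formula: RPKM = (count / gene length in kb) / (library size in millions). A minimal plain-Python restatement with invented numbers, just to make the arithmetic concrete:

def column_rpkm(counts, lengths_bp):
    # reads per kilobase of gene, per million mapped reads in the sample
    total = sum(counts)
    return [(c / (l / 1e3)) / (total / 1e6) for c, l in zip(counts, lengths_bp)]

# two toy genes (1 kb and 2 kb) in a 500-read library
print(column_rpkm([100, 400], [1000, 2000]))  # [200000.0, 400000.0]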
Spike-in scaling factor script (scripts/compute_scaling_factors.R):

library(tidyverse)

# import data ------------------------------------------------------------------
# read in all relevant stat files
stat_files <- snakemake@input[["mapping_stats"]]
names(stat_files) <- stat_files

mapping_stats <- stat_files %>%
  map_dfr(read_tsv, .id = "source", col_names = c("chromosome", "chromosome_size", "mapped_reads", "unmapped_reads")) %>%
  mutate(sample_name = gsub("_unireads.idxstats", "", basename(source)))


# add column marking reference or spike-in chromosomes
mapping_stats <- mapping_stats %>%
  mutate(reference_genome = case_when(str_detect(chromosome, "spikeIn")  ~ "spikeIn",
                                      !str_detect(chromosome, "spikeIn")  ~ "reference")) 


# compute % reads mapping to reference and spike-in genomes
mapping_stats <- mapping_stats %>% 
  group_by(sample_name, reference_genome) %>% 
  summarise(total_reads = sum(mapped_reads)) %>%
  pivot_wider(names_from = reference_genome, values_from = total_reads) %>%
  mutate(total_mapped_reads = reference + spikeIn) %>%
  mutate(percent_reference = reference / total_mapped_reads * 100, percent_spikeIn = spikeIn / total_mapped_reads * 100)


# read in unit table
sample_table <- snakemake@config[["units"]] %>% 
  read_tsv() %>% 

  # remove redundant technical replicates from sample_table
  distinct(sample_name, .keep_all = TRUE)


# mark samples as inputs or IPs
sample_table <- sample_table %>% 
  mutate(chip_sample_type = case_when(
    (sample_name %in% .$input) ~ "input",
    !(sample_name %in% .$input) ~ "IP"
  )) %>% 
  mutate(has_input = !is.na(input))


# create new column indicating input/IP pairs
tmp <- sample_table %>% select(input, sample_name) %>% dplyr::rename(IP = sample_name, sample_name = input)
sample_table <- sample_table %>% 
  left_join(tmp, by = "sample_name") %>% 
  mutate(IP_group = case_when(
    is.na(IP) ~ sample_name,
    !is.na(IP) ~ IP
  ))




# prepare sample_table and mapping_stats tables for join
individual_mapping_stats <- mapping_stats %>% 
  select(sample_name, percent_reference, percent_spikeIn)

# join sample_table and mapping stats
individual_scaling_factors <- sample_table %>%
  left_join(individual_mapping_stats, by = "sample_name")


# compute normalization factors for individual samples -------------------------
sample_has_input <- individual_scaling_factors %>% 
  filter(chip_sample_type == "IP") %>% 
  select(has_input) %>% 
  pull()

if (all(sample_has_input)) {

  individual_scaling_factors <- individual_scaling_factors %>% 
    pivot_wider(names_from = chip_sample_type, values_from = percent_spikeIn, id_cols = IP_group) %>% 
    mutate(enrichment = IP / input) %>%
    mutate(scaling_factor = 1 / enrichment) %>%
    mutate(norm_scaling_factor = scaling_factor / max(scaling_factor))

} else {
  warning("not all IPs have corresponding input sample, calculating scaling factors based on %spikeIn for IP samples")
  individual_scaling_factors <- individual_scaling_factors %>% 
    pivot_wider(names_from = chip_sample_type, values_from = percent_spikeIn, id_cols = IP_group) %>% 
    mutate(scaling_factor = 1 / IP) %>%
    mutate(norm_scaling_factor = scaling_factor / max(scaling_factor))
}

individual_scaling_factors %>% 
  write_tsv(snakemake@output[[1]])

# prepare data for merged samples -----------------------------
merged_mapping_stats <- mapping_stats %>% 
  select(sample_name, reference, spikeIn)

# combine replicate reads
merged_mapping_stats <- sample_table %>% 
  left_join(merged_mapping_stats, by = "sample_name") %>% 
  group_by(sample_group) %>% 
  summarise(spikeIn = sum(spikeIn), reference = sum(reference))




# create new column indicating input/IP pairs
tmp <- sample_table %>% select(input, sample_group) %>% dplyr::rename(IP = sample_group, sample_name = input)
sample_table <- sample_table %>% 
  select(-IP, -IP_group) %>% 
  left_join(tmp, by = "sample_name") %>% 
  mutate(IP_group = case_when(
    is.na(IP) ~ sample_group,
    !is.na(IP) ~ IP
  ))



# join sample_table and mapping stats for merged reads
merged_scaling_factors <- merged_mapping_stats %>% 
  left_join(select(sample_table, sample_group, chip_sample_type,IP_group, has_input), by = "sample_group") %>% 
  distinct(.keep_all = TRUE) %>%
  mutate(total_mapped_reads = reference + spikeIn) %>%
  mutate(percent_reference = reference / total_mapped_reads * 100, percent_spikeIn = spikeIn / total_mapped_reads * 100)

# compute normalization factors for merged samples -------------------------
sample_has_input <- merged_scaling_factors %>% 
  filter(chip_sample_type == "IP") %>% 
  select(has_input) %>% 
  pull()
if (all(sample_has_input)) {

  merged_scaling_factors <- merged_scaling_factors %>% 
    pivot_wider(names_from = chip_sample_type, values_from = percent_spikeIn, id_cols = IP_group) %>% 
    mutate(enrichment = IP / input) %>%
    mutate(scaling_factor = 1 / enrichment) %>%
    mutate(norm_scaling_factor = scaling_factor / max(scaling_factor))

} else {
  warning("not all IPs have corresponding input sample, calculating scaling factors based on %spikeIn for IP samples")
  merged_scaling_factors <- merged_scaling_factors %>% 
    pivot_wider(names_from = chip_sample_type, values_from = percent_spikeIn, id_cols = IP_group) %>% 
    mutate(scaling_factor = 1 / IP) %>%
    mutate(norm_scaling_factor = scaling_factor / max(scaling_factor))
}

merged_scaling_factors %>% 
  write_tsv(snakemake@output[[2]])
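
The arithmetic behind these scaling factors: spike-in enrichment is %spikeIn(IP) / %spikeIn(input), each sample's factor is the reciprocal of its enrichment, and factors are then rescaled so the largest equals 1. A toy Python check (values invented):

# enrichment = %spikeIn in IP / %spikeIn in matched input; factor = 1 / enrichment
pct_ip = {"IP_A": 2.0, "IP_B": 4.0}
pct_input = {"IP_A": 1.0, "IP_B": 1.0}

scaling = {ip: 1 / (pct_ip[ip] / pct_input[ip]) for ip in pct_ip}
top = max(scaling.values())
print({ip: sf / top for ip, sf in scaling.items()})  # {'IP_A': 1.0, 'IP_B': 0.5}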
DESeq2 script (scripts/DEseq2.R):

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(tidyverse))

# import count table -----------------------------------------------------------
count_table <- read_tsv(snakemake@input[[1]], comment = "#") %>% 
  select(-c(2:6)) %>% 
  column_to_rownames(var = "Geneid")
colnames(count_table) <- gsub(".bam", "", basename(colnames(count_table)))

# create colData table ---------------------------------------------------------
sample_table <- read_tsv(snakemake@params[["samples"]]) %>% 
  filter(experiment == snakemake@wildcards[["experiment"]])

coldata <- tibble(sample_name = colnames(count_table)) %>% 
  left_join(sample_table, by = "sample_name")

# run DESeq2 -------------------------------------------------------------------
dds <- DESeqDataSetFromMatrix(as.matrix(count_table), coldata, design = as.formula(snakemake@params[["model"]]))

# filter out low count genes
keep <- rowSums(counts(dds)) >= snakemake@params[["count_threshold"]]
dds2 <- dds[keep,]

# test for differential gene expression
dds2 <- DESeq(dds2)

# write dds2 object to Rdata file ----------------------------------------------
saveRDS(dds2, file = snakemake@output[[1]])
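
A rule driving this script would supply the sample sheet, design formula, and count threshold through `params`; everything in this sketch (paths, formula, threshold) is an assumption:

rule deseq2:
    input:
        "counts/all.featureCounts",
    output:
        "results/{experiment}/dds.rds",
    params:
        samples="config/samples.tsv",    # sample sheet with an `experiment` column
        model="~condition",              # design formula for DESeqDataSetFromMatrix
        count_threshold=10,              # genes below this summed count are dropped
    script:
        "../scripts/DEseq2.R"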
DESeq2 results script (scripts/DEseq2_results.R):

suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(rtracklayer))

# import dds object ------------------------------------------------------------
## import dds2 object
dds <- readRDS(snakemake@input[["dds"]])

# get DEseq results for all contrasts ------------------------------------------
contrast <- c("condition", snakemake@params[["contrast"]])
results <- results(dds, contrast = contrast, alpha = snakemake@params[["padj_cutoff"]]) %>% 
  as.data.frame() %>% 
  arrange(padj) %>% 
  rownames_to_column(var = "gene_id")

# add additional information to results table ----------------------------------
# read in GTF file
gtf <- rtracklayer::import(snakemake@input[["annotation"]]) %>% 
    mcols() %>%
    as.data.frame() %>%
    filter(type == "gene") %>%
    dplyr::select(gene_id, gene_symbol)

# add gene symbol to results
out_table <- dplyr::left_join(results, gtf, by = "gene_id")

# add column flagging genes as differentially expressed at the configured
# padj and log2 fold-change cutoffs
out_table <- out_table %>%
  dplyr::mutate(is_diff = (padj < snakemake@params[["padj_cutoff"]] & (abs(log2FoldChange) > snakemake@params[["FC_cutoff"]]))) %>%
  replace_na(list(is_diff = FALSE))

# write output file ------------------------------------------------------------
write_tsv(out_table, snakemake@output[[1]])
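
The results script is parameterized the same way; note that `FC_cutoff` is compared against `abs(log2FoldChange)`, so a value of 1 means a two-fold change. A hedged sketch (paths and contrast levels are assumptions):

rule deseq2_results:
    input:
        dds="results/{experiment}/dds.rds",
        annotation="resources/annotation.gtf",
    output:
        "results/{experiment}/results.tsv",
    params:
        contrast=["treated", "untreated"],  # levels of `condition` (assumed names)
        padj_cutoff=0.05,
        FC_cutoff=1,                        # log2 scale
    script:
        "../scripts/DEseq2_results.R"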
fasterq-dump script (scripts/fasterq-dump.py):

__author__ = "Johannes Köster, Derek Croote"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")


# Parse memory
mem_mb = get_mem(snakemake, "MiB")


# Outdir
outdir = os.path.dirname(snakemake.output[0])
if outdir:
    outdir = f"--outdir {outdir}"


# Output compression
compress = ""
mem = f"-m{mem_mb}" if mem_mb else ""

for output in snakemake.output:
    out_name, out_ext = os.path.splitext(output)
    if out_ext == ".gz":
        compress += f"pigz -p {snakemake.threads} {out_name}; "
    elif out_ext == ".bz2":
        compress += f"pbzip2 -p{snakemake.threads} {mem} {out_name}; "


with tempfile.TemporaryDirectory() as tmpdir:
    mem = f"--mem {mem_mb}M" if mem_mb else ""

    shell(
        "(fasterq-dump --temp {tmpdir} --threads {snakemake.threads} {mem} "
        "{extra} {outdir} {snakemake.wildcards.accession}; "
        "{compress}"
        ") {log}"
    )
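
The script keys off the `accession` wildcard and compresses any output ending in `.gz` or `.bz2` after the dump. A hedged paired-end retrieval rule (paths and the extra flag are assumptions):

rule fasterq_dump:
    output:
        "sra/{accession}_1.fastq.gz",    # .gz extension triggers the pigz step above
        "sra/{accession}_2.fastq.gz",
    params:
        extra="--skip-technical",        # assumed fasterq-dump option
    threads: 6
    resources:
        mem_mb=4096,                     # read by get_mem()
    log:
        "logs/fasterq_dump/{accession}.log",
    script:
        "../scripts/fasterq-dump.py"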
Spike-in bigwig normalization script (scripts/spikeIn_normalize_bw.R):

suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(tidyverse))

# import data --------------------------------------------------------------------------------------------------
# get filename of bigwig that will be used for normalization
bw <- snakemake@input[["bw"]]

# read in normalization factors for merged replicates
scaling_factors <- read_tsv(snakemake@input[["scaling_factors"]])

# get sample name to use for finding the correct scaling factor
file_bn <- gsub(".bw", "", basename(bw))

# rescale bigwig file based on spike-in normalization factor ---------------------------------------------------

# get scaling factor
scaling_factor <- scaling_factors %>%
  filter(IP_group == file_bn) %>%
  select(norm_scaling_factor) %>%
  pull()


# normalize bigwig
scaled.gr <- import(bw) %>%
  as.data.frame() %>%
  mutate(score = score * scaling_factor) %>%
  makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# set seqinfo of normalized Granges object
seqlevels(scaled.gr) <- seqlevels(BigWigFile(bw))
seqinfo(scaled.gr) <- seqinfo(BigWigFile(bw))


# export normalized bigwig -----------------------------------------------------
export(scaled.gr, snakemake@output[[1]], format = "bw")
Z-score bigwig normalization script (scripts/zscore_normalize_bw.R):

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(GenomicRanges))


zscore_bw <- function(bw) {
  require(tidyverse)
  require(rtracklayer)
  require(GenomicRanges)

  # import bigwig file to Granges
  if (typeof(bw) == "character") {
    message("reading bigwig file")
    bw <- import(bw)
  }

  # if using a spike-in, filter the seqlevels to only the reference genome
  if (snakemake@config[["use_spikeIn"]]) {
    message("removing spikeIn chromosomes")
    ref_chroms <- seqlevels(bw)[!grepl("spikeIn_", seqlevels(bw))]
    bw <- keepSeqlevels(bw, ref_chroms, pruning.mode = "coarse")
  }

  if (snakemake@config[["filter_chroms"]]) {
    message("filtering reference chromosomes")
    keep_chroms <- read_tsv(snakemake@config[["keep_chroms"]], col_names = c("chromosome"))
    ref_chroms <- seqlevels(bw)[seqlevels(bw) %in% keep_chroms$chromosome]
    bw <- keepSeqlevels(bw, ref_chroms, pruning.mode = "coarse")
  }


  # for large regions with the same score, expand into equal sized bins
  message("binning genome")
  min_binsize <- min(width(bw))
  all_bins <- tileGenome(seqinfo(bw), tilewidth=min_binsize,cut.last.tile.in.chrom=TRUE)

  message("getting scores for all bins")
  # add the coverage/score for both input and IP
  all_bins <- subsetByOverlaps(all_bins, bw)
  overlaps <- findOverlaps(all_bins, bw)
  all_bins$score[overlaps@from] <- bw$score[overlaps@to]

  # perform z-score normalization
  message("performing z-score normalization")
  all_bins$zscore <- scale(all_bins$score)[,1]
  all_bins$score <- NULL
  all_bins$score <- all_bins$zscore
  all_bins$zscore <- NULL
  # collapse adjacent bins with same score
  collapsed <- unlist(GenomicRanges::reduce(split(all_bins, ~score)))
  collapsed$score <- as.numeric(names(collapsed))
  names(collapsed) <- NULL
  all_bins <- collapsed

  #set seqinfo for z-score normalized version
  seqinfo(all_bins) <- seqinfo(bw)

  return(all_bins)
}


# perform z-score normalization and write new bigwig files ---------------------
zscore.gr <- zscore_bw(snakemake@input[[1]])
export(zscore.gr, snakemake@output[[1]])
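
The normalization inside `zscore_bw` is the standard z-score over all bins, z = (score - mean) / sd, computed by R's `scale()` (which uses the sample standard deviation). A plain-Python equivalent on toy values:

def zscore(scores):
    # center on the mean, scale by the sample standard deviation (n - 1), as scale() does
    n = len(scores)
    mean = sum(scores) / n
    sd = (sum((s - mean) ** 2 for s in scores) / (n - 1)) ** 0.5
    return [(s - mean) / sd for s in scores]

print(zscore([1.0, 2.0, 3.0]))  # [-1.0, 0.0, 1.0]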
Source: https://github.com/tjgibson/NGS-workflow-RNAseq