This pipeline contains the data processing steps used for Thuy Ngo's 2019 cell-free RNA manuscript

public public 1yr ago 0 bookmarks

cfRNA-seq Pipeline

Pipeline to process raw fastq files for Thuy Ngo's manuscript: "Plasma cell-free RNA profiling enables pan-cancer detection and distinguishes cancer from pre-malignant conditions"

This is a package of Python and R scri

Code Snippets

 9
10
11
12
run:
    sickle = config["sickle_tool"]

    shell("{sickle} pe -f {input.fwd} -r {input.rev}  -l 40 -q 20 -t sanger  -o {output.fwd} -p {output.rev} -s {output.single} &> {input.fwd}.log")
23
24
shell:
    """fastqc --outdir samples/fastqc/{wildcards.sample} --extract  -f fastq {input.fwd} {input.rev}"""
41
42
shell:
    """fastq_screen --aligner bowtie2 --conf {params.conf} --outdir samples/fastqscreen/{wildcards.sample} {input.fwd} {input.rev}"""
54
55
56
57
58
59
60
61
62
63
64
65
66
67
run:
     STAR=config["star_tool"],
     pathToGenomeIndex = config["star_index"]

     shell("""
            {STAR} --runThreadN {threads} --runMode alignReads --genomeDir {pathToGenomeIndex} \
            --readFilesIn {input.fwd} {input.rev} \
            --outFileNamePrefix samples/star/{wildcards.sample}_bam/ \
            --sjdbGTFfile {params.gtf} --quantMode GeneCounts \
            --sjdbGTFtagExonParentGene gene_name \
            --outSAMtype BAM SortedByCoordinate \
            #--readFilesCommand zcat \
            --twopassMode Basic
            """)
76
77
shell:
    """samtools index {input} {output}"""
84
85
script:
    "../scripts/compile_star_log.py"
93
94
95
96
97
98
99
run:
    picard_insert_size=config["insert_size_tool"]

    shell("java -jar {picard_insert_size} \
           I={input} \
           O={output.metrics} \
           H={output.hist}")
109
110
111
112
113
114
115
116
run:
  picard=config["picard_tool"]

  shell("java -Xmx3g -jar {picard} \
  INPUT={input} \
  OUTPUT={output} \
  METRICS_FILE=samples/genecounts_rmdp/{wildcards.sample}_bam/{wildcards.sample}.rmd.metrics.text \
  REMOVE_DUPLICATES=true")
129
130
shell:
  """samtools sort -O bam -n {input} -o {output}"""
142
143
wrapper:
    "0.17.0/bio/samtools/stats"
159
160
161
162
163
164
165
166
shell:
    """htseq-count \
            -f bam \
            -r name \
            -s reverse \
            -m intersection-strict \
            {input} \
            {params.gtf} > {output}"""
178
179
180
181
182
183
184
185
186
187
shell:
    """htseq-count \
           -f bam \
           -r name \
           -s reverse \
           -m intersection-strict \
           -i exon_id \
           --additional-attr=gene_name \
           {input} \
           {params.exon_gtf} > {output}"""
195
196
script:
    "../scripts/compile_counts_table.py"
204
205
script:
    "../scripts/compile_counts_table_w_stats.py"
213
214
script:
    "../scripts/compile_exon_counts.R"
17
18
script:
    "../scripts/deseq2-init.R"
42
43
script:
    "../scripts/deseq2_pairwise.R"
66
67
script:
    "../scripts/deseq2_group.R"
89
90
script:
    "../scripts/QC.R"
103
104
script:
    "../scripts/qplot.R"
SnakeMake From line 103 of rules/deseq.smk
119
120
script:
    "../scripts/density_plot.R"
SnakeMake From line 119 of rules/deseq.smk
137
138
script:
    "../scripts/runGOforDESeq2.R"
SnakeMake From line 137 of rules/deseq.smk
152
153
script:
    "../scripts/RNAseq_makeVolcano.R"
SnakeMake From line 152 of rules/deseq.smk
170
171
script:
    "../scripts/permutation_test.R"
SnakeMake From line 170 of rules/deseq.smk
185
186
script:
    "../scripts/run_glimma.R"
SnakeMake From line 185 of rules/deseq.smk
198
199
script:
    "../scripts/run_glimma_mds.R"
SnakeMake From line 198 of rules/deseq.smk
13
14
shell:
    "insertion_profile.py -s '{params.seq_layout}' -i {input} -o rseqc/insertion_profile/{wildcards.sample}/{wildcards.sample}"
28
29
shell:
    "inner_distance.py -i {input} -o rseqc/inner_distance/{wildcards.sample}/{wildcards.sample} -r {params.bed}"
44
45
shell:
    "clipping_profile.py -i {input} -s '{params.seq_layout}' -o rseqc/clipping_profile/{wildcards.sample}/{wildcards.sample}"
57
58
shell:
   "read_distribution.py -i {input} -r {params.bed} > {output}"
66
67
script:
    "../scripts/get_rd.py"
79
80
shell:
    "read_GC.py -i {input} -o rseqc/read_GC/{wildcards.sample}/{wildcards.sample}"
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
import pandas as pd


"""Accepts a HTSeq output directory and compiles {sample}_htseq_gene_count.txt into a joined tab sep file
Args:
    snakemake.input (list): list of globbed wildcards HTSeq 
    snakemake.output[0] (str): data/{params.project_id}_counts.txt

Returns:
    Compiled STAR gene counts table as tab delimited file.
"""

tables = [pd.read_csv(fh, sep='\t', index_col=0, names=[fh.split('/')[-1].split('_')[0]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
filtered_joined = joined_table.iloc[:-5, :]
filtered_joined_sorted = filtered_joined.reindex(sorted(filtered_joined.columns), axis = 1)
filtered_joined_sorted.to_csv(snakemake.output[0], sep='\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
import pandas as pd


"""Accepts a HTSeq output directory and compiles {sample}_htseq_gene_count.txt into a joined tab sep file
Args:
    snakemake.input (list): list of globbed wildcards HTSeq 
    snakemake.output[0] (str): data/{params.project_id}_counts.txt
Returns:
    Compiled STAR gene counts table as tab delimited file.
"""

tables = [pd.read_csv(fh, sep='\t', index_col=0, names=[fh.split('/')[-1].split('_')[0]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
joined_sorted = joined_table.reindex(sorted(joined_table.columns), axis = 1)
joined_sorted.to_csv(snakemake.output[0], sep='\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
files = snakemake@input

#make a empty dataframe, ncol= number of samples +1, nrow= number of rows in gene count file
all = data.frame(matrix(ncol = length(files) + 2, nrow=nrow(read.table(files[[1]], header = F, sep = "\t"))))

#get row names
samp1=read.table(file = files[[1]], header = F, sep = "\t")
genelist=samp1[,1:2]
genelist=as.data.frame(genelist)
colnames(genelist)=c("ensembl_gene_id","gene_id")
head(genelist)

#combine counts from files
for(i in 1:length(files)){
  counts = read.table(file = files[[i]], header = F, stringsAsFactors  = F, sep = "\t")
  print(files[[i]])
  samp=gsub("_htseq_exon_count.txt","",files[[i]])

  colnames(counts)=c("ensembl_gene_id","gene_id",samp)

  if(i==1){
    all=merge(genelist,counts)  
  } 
  else
  {all=merge(all,counts) }

}

write.table(all, file = snakemake@output[[1]], quote = F, sep = "\t", row.names = F)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
import pandas as pd


"""Function accepts a STAR output directory and compiles all sample information from Log.final.out
Args:
    snakemake.input (list): list of globbed wildcards STAR Log.final.out
    project_title (str): Project title for compiled STAR mapping statistics

Returns:
    Compiled STAR log.final.out as tab delimited file.
"""

tables = [pd.read_csv(fh, sep = '\t', index_col = 0, names = [fh.split('/')[-2]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
joined_table_sorted = joined_table.reindex(sorted(joined_table.columns), axis = 1)
joined_table_sorted.to_csv(snakemake.output[0], sep='\t')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
library(DESeq2)
library(ggplot2)
library(reshape2)
library(data.table)
library(plyr)
library(RColorBrewer)


rld <- snakemake@input[['rld']]
cat(sprintf(c('rld: ', rld, '\n')))


condition <- snakemake@params[['linear_model']]
cat(sprintf(c('condition: ', condition, '\n')))

project_id <- snakemake@params[['project_id']]

density_plot <- snakemake@output[['density']]
cat(sprintf(c('Density plot : ', density_plot, '\n')))


colors <- snakemake@params['colors']
discrete <- snakemake@params['discrete']

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rld = readRDS(rld)
normed_values = assay(rld)
normed_t = t(normed_values)
meta = colData(rld)

if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if(brewer.pal.info[colors[[1]],]$maxcolors >= length(levels(meta[[condition]]))) {
        pal <- brewer.pal(length(levels(meta[[condition]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(levels(meta[[condition]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(levels(meta[[condition]])))
}

joined_counts = cbind(meta[condition],normed_t)

x = as.data.table(joined_counts)
mm <- melt(x,id=condition)

mu <- ddply(mm, condition, summarise, grp.mean=mean(value))
pdf(density_plot)
p<-ggplot(mm, aes_string(x='value', color=condition)) +
  geom_density()+
  geom_vline(data=mu, aes_string(xintercept='grp.mean', color=condition),
             linetype="dashed") + xlab('regularized log expression') + 
  scale_color_manual(values=pal) +
  ggtitle(eval(project_id)) + theme(plot.title = element_text(hjust = 0.5))
p
dev.off()
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
library("DESeq2")
library("ggplot2")
library("pheatmap")
library("dplyr")
library("vsn")
library("RColorBrewer")
library("genefilter")

cat(sprintf(c('Working directory',getwd())))

cat(sprintf('Setting parameters'))

pca_plot <- snakemake@output[['pca']]
cat(sprintf(c('PCA plot: ',pca_plot)))

labels <- snakemake@params[['pca_labels']]
cat(sprintf(c('PCA Labels: ',labels)))

sd_mean_plot <- snakemake@output[['sd_mean_plot']]
cat(sprintf(c('SD Mean plot: ',sd_mean_plot,'\n')))

distance_plot <- snakemake@output[['distance_plot']]
cat(sprintf(c('Distance plot: ',distance_plot,'\n')))

heatmap_plot <- snakemake@output[['heatmap_plot']]
cat(sprintf(c('Heatmap Plot: ', heatmap_plot, '\n')))

rds_out <- snakemake@output[['rds']]
cat(sprintf(c('RDS Output: ', rds_out, '\n')))

rld_out <- snakemake@output[['rld_out']]
cat(sprintf(c('RLD Output: ', rld_out, '\n')))

counts <- snakemake@input[['counts']]
cat(sprintf(c('Counts table: ', counts, '\n')))

metadata <- snakemake@params[['samples']]
cat(sprintf(c('Metadata: ', metadata, '\n')))

sampleID <- snakemake@params[['sample_id']]
cat(sprintf(c('Sample ID: ', sampleID, '\n')))

Type <- snakemake@params[['linear_model']]
cat(sprintf(c('Linear Model: ', Type, '\n')))

group <- snakemake@params[['LRT']]
cat(sprintf(c('Subsetted group: ', group, '\n')))

plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

# color palette
colors <- snakemake@params[['colors']]
discrete <- snakemake@params[['discrete']]

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

Dir <- "results/diffexp/group/"

md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[sampleID]),]

# Read in counts table
cts <- read.table(counts, header=TRUE, row.names=1, sep="\t", check.names=F)
cts <- cts[,order(colnames(cts))]

# Put sample IDs as rownames of metadata
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL

# Ensure that we subset md to have exactly the same samples as in the counts table
md <- md[colnames(cts),]
dim(md)

# Check
stopifnot(rownames(md)==colnames(cts))

# Define colours based on number of Conditions
if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if (brewer.pal.info[colors[[1]],]$maxcolors >= length(unique(md[[Type]]))) {
        pal <- brewer.pal(length(unique(md[[Type]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(unique(md[[Type]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(unique(md[[Type]])))
}

# Create dds object from counts data and correct columns
dds <- DESeqDataSetFromMatrix(countData=cts,
                              colData=md,
                              design= as.formula(paste('~',Type)))

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) >= 1, ]

# Likelihood Ratio test to look at differential expression across ALL types, and not just pairs of types (contrast)
dds.lrt <- DESeq(dds, test="LRT", reduced=~1)
res.lrt <- results(dds.lrt, cooksCutoff = Inf, independentFiltering=FALSE)
head(res.lrt)

# Obtain normalized counts
rld <- rlog(dds.lrt, blind=FALSE)

# Pairwise PCA Plot
pcaData <- plotPCA(rld, intgroup=labels[[1]], returnData=TRUE)

pdf(pca_plot)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]])) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  scale_colour_manual(values=pal)
dev.off()

# Pairwise PCA Plot with more than one PCA parameter
if (length(labels)>1) {
  pca_plot2 <- sub("$","twoDimensional_pca_plot.pdf", Dir)
  pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
  pdf(pca_plot2, 5, 5)
  percentVar <- round(100 * attr(pcaData, "percentVar"))
  ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
    geom_point(size=3) +
    xlab(paste0("PC1: ",percentVar[1],"% variance")) +
    ylab(paste0("PC2: ",percentVar[2],"% variance")) +
    coord_fixed() +
    scale_colour_manual(values=pal)
  dev.off()
}

# SD mean plot
pdf(sd_mean_plot)
meanSdPlot(assay(rld))
dev.off()

# Heatmap of distances
pdf(distance_plot)
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)

rownames(sampleDistMatrix) <- colnames(rld)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix, fontsize=5, scale="row",
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)
dev.off()

# Heatmap across all samples
# List top 50 genes for group comparisons
topGenes <- head(order(res.lrt$padj), 50)

# Extract topGenes from rld object
plot <- assay(rld)[topGenes,] #for 2+ types

# Generate data frame with samples as the rownames and single colData as the first row
# Default when we subset creates an incompatible dataframe so this is a check
df <- as.data.frame(colData(rld))
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot[[sampleID]]
  annot[[sampleID]] <- NULL
} else {
  annot <- df[,subset_cols]
}

filt <- plot[apply(plot, MARGIN = 1, FUN = function(x) sd(x) != 0),]

pdf(heatmap_plot)
pheatmap(filt, cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes across all samples"))
dev.off()

saveRDS(dds, file=rds_out)
saveRDS(rld, file=rld_out)

group <- as.vector(group)

# If LRT group has been specified, run the analysis for that group
if (length(group)>0) {
  md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
  md <- md[order(md[sampleID]),]
  cts <- read.table(counts, header=TRUE, row.names=1, sep="\t")
  cts <- cts[,order(colnames(cts))]
  md <- md[md[[Type]] %in% group,]
  rownames(md) <- md[[sampleID]]
  md[[sampleID]] <- NULL
  keep <- colnames(cts)[colnames(cts) %in% rownames(md)]
  cts <- cts[, keep]
  dim(cts)
  md <- md[colnames(cts),]
  dim(md)

  dds <- DESeqDataSetFromMatrix(countData=cts,
                              colData=md,
                              design= as.formula(paste('~',Type)))
  dds <- dds[ rowSums(counts(dds)) >= 1, ]
  dds.lrt <- DESeq(dds, test="LRT", reduced=~1)
  res.lrt <- results(dds.lrt, cooksCutoff = Inf, independentFiltering=FALSE)
  rld <- rlog(dds.lrt, blind=FALSE)

  # Pairwise PCA Plot
  pdf(sub("$", "subsetted_pca_plot.pdf", Dir), 5, 5)
  plotPCA(rld, intgroup=labels[[1]])
  dev.off()
  # Pairwise PCA Plot with more than one PCA parameter
  if (length(labels)>1) {
    pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
    pdf(sub("$", "subsetted_twoDimensional_pca_plot.pdf", Dir), 5, 5)
    percentVar <- round(100 * attr(pcaData, "percentVar"))
    ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
      geom_point(size=3) +
      xlab(paste0("PC1: ",percentVar[1],"% variance")) +
      ylab(paste0("PC2: ",percentVar[2],"% variance")) +
      coord_fixed()
    dev.off()
  }

  # Heatmap
  topGenes <- head(order(res.lrt$padj), 50)
  # Extract topGenes from rld object
  plot <- assay(rld)[topGenes,] #for 2+ types

  df <- as.data.frame(colData(rld))
  if (length(subset_cols)==1) {
    annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
    names(annot) <- c("SampleID", subset_cols[1])
    rownames(annot) <- annot[[sampleID]]
    annot[[sampleID]] <- NULL
  } else {
    annot <- df[,subset_cols]
  }

  pdf(sub("$", "subsetted_heatmap.pdf", Dir), 5, 5)
  pheatmap(assay(rld)[topGenes,], cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes across selected samples"))
  dev.off()
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
library("dplyr")
library("DESeq2")

counts = snakemake@input[['counts']]

metadata <- snakemake@params[['samples']]

sampleID <- snakemake@params[['sample_id']]

Type <- snakemake@params[['linear_model']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

output = snakemake@output[['rds']]

rld_out = snakemake@output[['rld_out']]

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

# Read in metadata table and order according to sampleID
md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[sampleID]),]

# Read in counts table
subdata <- read.table(counts, header=TRUE, row.names=1, sep="\t", check.names=FALSE)
subdata <- subdata[,order(colnames(subdata))]

# Extract only the Types that we want in further analysis & only the PP_ID and Status informative columns
md <- filter(md, !!as.name(Type) == baseline | !!as.name(Type) == target, !!as.name(sampleID) %in% colnames(subdata))

# Keep only the PP_IDs of the types we have chosen in the metadata table above
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL
keep <- colnames(subdata)[colnames(subdata) %in% rownames(md)]
subdata <- subdata[, keep]
dim(subdata)

# Check
stopifnot(rownames(md)==colnames(subdata))

# Obtain the number of genes that meet padj<0.01 for reference line in histogram
dds <- DESeqDataSetFromMatrix(countData=subdata,
                              colData=md,
                              design= as.formula(paste('~',Type)))

dds <- estimateSizeFactors(dds)

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) > 1, ]

# Normalization and pre-processing
dds <- DESeq(dds, parallel=parallel)

saveRDS(dds, file=output)

# obtain normalized counts
rld <- rlog(dds, blind=FALSE)
saveRDS(rld, file=rld_out)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
library("DESeq2")
library("pheatmap")
library("ggplot2")
library("ggrepel")

print('Setting parameters')

rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))

rld = snakemake@input[['rld']]
cat(sprintf(c('RLD object: ',rld,'\n')))

Type = snakemake@params[['linear_model']]
cat(sprintf(c('Type: ',Type,'\n')))

sampleID = snakemake@params[['sample_id']]
cat(sprintf(c('Sample ID: ',sampleID,'\n')))

ma_plot = snakemake@output[['ma_plot']]
cat(sprintf(c('MA plot', ma_plot,'\n')))

out_table = snakemake@output[['table']]
cat(sprintf(c('Summary results table', out_table,'\n')))

out_gene_table = snakemake@output[['geneID_table']]

panel_ma = snakemake@output[['panel_ma']]
cat(sprintf(c('MA panel', panel_ma,'\n')))

heatmap_plot = snakemake@output[['heatmap_plot']]
cat(sprintf(c('Heatmap plot', heatmap_plot,'\n')))

var_heat = snakemake@output[['var_heat']]
cat(sprintf(c('Variance Heatmap plot', var_heat,'\n')))

pca_plot = snakemake@output[['pca_plot']]
cat(sprintf(c('PCA plot', pca_plot,'\n')))

labels <- snakemake@params[['pca_labels']]
cat(sprintf(c('PCA Labels: ',labels)))

cat(sprintf('Load dds DESeqTransform object'))
dds <- readRDS(rds)

cat(sprintf('Load rlog DESeqTransform object'))
rld <- readRDS(rld)

Dir <- "results/diffexp/pairwise/"

plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

contrast <- c(Type, snakemake@params[["contrast"]])
baseline <- contrast[[3]]
target <- contrast[[2]]

upCol = "#FF9999"
downCol = "#99CCFF"
ncCol = "#CCCCCC"
adjp <- 0.01
FC <- 2

ens2geneID <- snakemake@config[['ens2geneID']]

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

# Pairwise PCA Plot
pdf(pca_plot)
plotPCA(rld, intgroup=labels[[1]])
dev.off()

# Pairwise PCA Plot with more than one PCA parameter
if (length(labels)>1) {
  pca_plot2 <- sub("$",paste(contrast[2],"vs",contrast[3],"twoDimensional_pca_plot.pdf", sep = "-"), Dir)
  pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
  pdf(pca_plot2, 5, 5)
  percentVar <- round(100 * attr(pcaData, "percentVar"))
  ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
    geom_point(size=3) +
    xlab(paste0("PC1: ",percentVar[1],"% variance")) +
    ylab(paste0("PC2: ",percentVar[2],"% variance")) +
    coord_fixed()
  dev.off()
}

res <- results(dds, contrast=contrast,  independentFiltering = FALSE, cooksCutoff = Inf)
# shrink fold changes for lowly expressed genes
res <- lfcShrink(dds, contrast=contrast, res=res)

# MA plot - calc norm values yourself to plot with ggplot
# MA plot is log2normalized counts (averaged across all samples) vs. log2FC

# extract normalized counts to calculate values for MA plot
norm_counts <- counts(dds, normalized=TRUE)

## select up regulated genes
forPlot <- as.data.frame(res)
forPlot$log2Norm <- log2(rowMeans(norm_counts))
forPlot$Gene <- rownames(forPlot)

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
forPlot$Gene <- sub("\\.[0-9]*", "", forPlot$Gene)
iv <- match(forPlot$Gene, gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
forPlot$Gene  <- paste(gene_id[iv, "external_gene_name"])

up <- forPlot$padj < adjp & forPlot$log2FoldChange > log2(FC)
sum(up)

## select down regulated genes
down <- forPlot$padj < adjp & forPlot$log2FoldChange < -log2(FC)
sum(down)

# Grab the top 5 up and down regulated genes to label in the volcano plot
if (sum(up)>5) {
  temp <- forPlot[up,]
  upGenesToLabel <- head(rownames(temp[order(-temp$log2FoldChange),], 5))
} else if (sum(up) %in% 1:5) {
  temp <- forPlot[up,]
  upGenesToLabel <- rownames(temp[order(-temp$log2FoldChange),])
}

if (sum(down)>5) {
  temp <- forPlot[down,]
  downGenesToLabel <- head(rownames(temp[order(temp$log2FoldChange),], 5))
} else if (sum(down) %in% 1:5) {
  temp <- forPlot[down,]
  downGenesToLabel <- rownames(temp[order(temp$log2FoldChange),])
}

forPlot$Expression <- ifelse(down, 'down',
                  ifelse(up, 'up','NS'))
forPlot$Expression <- factor(forPlot$Expression, levels=c("up","down","NS"))

# Assign colours to conditions
if (sum(up)==0 & sum(down)==0) {
  colours <- ncCol
} else if (sum(up)==0) {
  colours <- c(downCol, ncCol)
} else if (sum(down)==0) {
  colours <- c(upCol, ncCol)
} else {
  colours <- c(upCol, downCol, ncCol)
}

# Create vector for labelling the genes based on whether genes are DE or not
if (exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- c(downGenesToLabel, upGenesToLabel)
} else if (exists("downGenesToLabel") & !exists("upGenesToLabel")) {
  genesToLabel <- downGenesToLabel
} else if (!exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- upGenesToLabel
}

if (exists("genesToLabel")) {
  maPlot <- ggplot(forPlot, mapping=aes(x=log2Norm, y=log2FoldChange, colour=Expression)) +
    geom_point() +
    geom_hline(yintercept=c(-1,1), linetype="dashed", color="black") +
    geom_label_repel(aes(label=ifelse(Gene %in% genesToLabel, as.character(Gene),'')),box.padding=0.1, point.padding=0.5, segment.color="gray70", show.legend=FALSE) +
    scale_colour_manual(values=colours) +
    ggtitle(paste(baseline, "vs", target)) +
    xlab("log2(Normalized counts)") +
    ylab("log2(Fold Change)") +
    theme(plot.title = element_text(hjust=0.5))
} else {
  maPlot <- ggplot(forPlot, mapping=aes(x=log2Norm, y=log2FoldChange, colour=Expression)) +
    geom_point() +
    geom_hline(yintercept=c(-1,1), linetype="dashed", color="black") +
    scale_colour_manual(values=colours) +
    ggtitle(paste(baseline, "vs", target)) +
    xlab("log2(Normalized counts)") +
    ylab("log2(Fold Change)") +
    theme(plot.title = element_text(hjust=0.5))
}

# MA plot
pdf(ma_plot)
print({
  maPlot
})
dev.off()

# P-histogram
p_hist = snakemake@output[['p_hist']]
pdf(p_hist)
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20, col = "grey50", border = "white", main='P values for genes with mean normalized count larger than 1',xlab='pvalue')
dev.off()

#panel ma plot
pdf(panel_ma)
par(mfrow=c(2,2),mar=c(2,2,1,1) +0.1)
ylim <- c(-2.5,2.5)
resGA <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="greaterAbs")
resLA <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="lessAbs")
resG <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="greater")
resL <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="less")
drawLines <- function() abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()
mtext(resG@elementMetadata$description[[2]], outer=T, cex=.6,line=-1)
dev.off()

# Heatmap of top 50 genes
topGenes <- head(order(res$padj),50)

df <- as.data.frame(colData(rld))
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot$SampleID
  annot$SampleID <- NULL
} else {
  annot <- df[,subset_cols]
}

forHeat <- assay(rld)[topGenes,]

## Remove unique identifier .xx from heatmap data
rownames(forHeat) <- sub("\\.[0-9]*", "", rownames(forHeat))
iv <- match(rownames(forHeat), gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
rownames(forHeat) <- paste(gene_id[iv, "external_gene_name"])

pdf(heatmap_plot)
pheatmap(forHeat, cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes:", contrast[2], "vs", contrast[3]))
dev.off()

# Variance Heatmap
pdf(var_heat)
topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 50)
mat  <- assay(rld)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)

## Remove unique identifier .xx from heatmap data
rownames(mat) <- sub("\\.[0-9]*", "", rownames(mat))
iv <- match(rownames(mat), gene_id$ensembl_gene_id)
head(gene_id[iv,])
rownames(mat) <- paste(gene_id[iv, "external_gene_name"])

pheatmap(mat, scale="row", annotation_col = annot,fontsize=6, main = paste("Heatmap of top 50 most variable genes:", contrast[2], "vs", contrast[3]))
dev.off()

# sort by p-value
res <- res[order(res$padj),]
write.table(as.data.frame(res), file=out_table, quote=FALSE, sep='\t')

# Export table with the geneIDs
## Remove unique identifier .xx from results table
res$GeneID <- sub("\\.[0-9]*", "", rownames(res))
iv <- match(res$GeneID, gene_id$ensembl_gene_id)
head(gene_id[iv,])

## Paste external gene name of these ensembl IDs into this column instead
## Use paste to get rid of factors of this column, and just paste the value of the gene name
res$GeneID  <- paste(gene_id[iv, "external_gene_name"])

resGen <- cbind(res$GeneID, as.data.frame(res)[,1:6])
colnames(resGen)[1] <- "GeneID"

# Export to table
write.table(resGen, file=out_gene_table, row.names=FALSE, quote=FALSE, sep="\t")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd

files = snakemake.input
out = snakemake.output[0]

def split_exon_fraction(files, out):
    sample_list = []
    for file in files:
        sample = file.split('.')[0]
        name = sample.split('/')[3]
        results = {}
        equal = False
        for line in open(file):
            if '==' in line:
                equal = True

            if not equal:
                line = line.split()
                key ='_'.join(line[:2])
                results[key] = float(line[-1])

            if equal:
                if not '==' in line and not'Group' in line:
                    line = line.split()
                    key = line[0]
                    results[key] = float(line[2])
                else:
                    continue
        Total = results['CDS_Exons'] + results["5'UTR_Exons"] + results["3'UTR_Exons"] + results['Introns'] + results['TSS_up_1kb'] + results['TSS_up_5kb'] + results['TSS_up_10kb'] + results['TES_down_1kb'] + results['TES_down_5kb'] + results['TES_down_10kb']
        exon_fraction = (results['CDS_Exons'] + results["5'UTR_Exons"] + results["3'UTR_Exons"]) / Total
        intron_fraction = results['Introns'] / Total
        intergenic_fraction = (results['TSS_up_1kb'] + results['TSS_up_5kb'] + results['TSS_up_10kb'] + results['TES_down_1kb'] + results['TES_down_5kb'] + results['TES_down_10kb']) / Total
        sample_list.append([name, exon_fraction, intron_fraction, intergenic_fraction])

    frame = pd.DataFrame(sample_list)
    frame.columns = ['Sample','Exon','Intron','Intergenic']
    frame.to_csv(out,sep='\t',index=False)

split_exon_fraction(files, out)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(DESeq2)
library(dplyr)
library(ggplot2)

# Generate subdata 
counts <- snakemake@input[['counts']]

metadata <- snakemake@params[['samples']]

sampleID <- snakemake@params[['sample_id']]

hist <- snakemake@output[['histogram']]

numGenes <- snakemake@output[['numGenes']]

permList <- snakemake@output[['permList']]

Type <- snakemake@params[['linear_model']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[[sampleID]]),]

# Read in counts table
subdata <- read.table(counts, header=TRUE, row.names=1, sep="\t")
subdata <- subdata[,order(colnames(subdata))]

# Extract only the Types that we want in further analysis & only the PP_ID and Status informative columns
md <- filter(md, !!as.name(Type) == baseline | !!as.name(Type) == target, !!as.name(sampleID) %in% colnames(subdata))

# Keep only the PP_IDs of the types we have chosen in the metadata table above
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL
keep <- colnames(subdata)[colnames(subdata) %in% rownames(md)]
subdata <- subdata[, keep]
dim(subdata)

# Check
stopifnot(rownames(md)==colnames(subdata))

# Get the number of Cancer samples and number of HD samples from md table
num1 = sum(md[[Type]] == baseline)
num2 = sum(md[[Type]] == target)

# Create a vector for both HD and Can, with a 1 for every HD and a 2 for every Cancer sample
One_vector = rep(c(1), times = num1)
Two_vector = rep(c(2), times = num2)

# Permutation
# Concatenate the HD and Can vector to create your "start group" vector
start_group = c(One_vector, Two_vector)
cutoff=0.01
number_of_diff_genes=c()
group_list = list()
number_of_try = 500

for (i in 1:number_of_try)
{
  print(i)
  group = data.frame(type=factor(sample(start_group)))

  dds = DESeqDataSetFromMatrix(countData = subdata,
                               colData = group,
                               design = ~ type)

  # Extract normalized counts
  dds = estimateSizeFactors(dds)

  # Remove genes with zero counts over all samples
  dds <- dds[ rowSums(counts(dds)) > 1, ]

  # Make sure of reference, set it by rlevel
  dds$type = relevel(dds$type, ref = 1)

  # The standard differential expression analysis steps are wrapped into a single function, DESeq
  dds = DESeq(dds)

  # Extract results
  res = results(dds, contrast = c("type", "1", "2"), independentFiltering = FALSE,cooksCutoff = Inf)

  tmp=sum(res$padj < cutoff, na.rm=TRUE)
  number_of_diff_genes = c(number_of_diff_genes,tmp)
  group_list[[i]] <- group

}

# Obtain the number of genes that meet padj<0.01 for reference line in histogram
dds <- DESeqDataSetFromMatrix(countData=subdata,
                              colData=md,
                              design= as.formula(paste('~',Type)))

dds <- estimateSizeFactors(dds)

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) > 1, ]

# Normalization and pre-processing
dds <- DESeq(dds)

# Extract results and the number of significant genes with padj<0.01
results = results(dds, contrast = c(Type, target, baseline), independentFiltering = FALSE,cooksCutoff = Inf)
numSig <- sum(results$padj < cutoff, na.rm=TRUE)
number_of_diff_genes <- as.data.frame(number_of_diff_genes)
names(number_of_diff_genes) <- "NumDiffGenes"
number_of_diff_genes$Actual <- numSig

p <- ggplot(number_of_diff_genes, aes(x=NumDiffGenes)) +
  geom_histogram(bins=100) +
  geom_vline(data=number_of_diff_genes, mapping=aes(xintercept = numSig, color = "Correct Labels"), 
             linetype="longdash", size=0.6, show.legend = T) +
  scale_color_manual(values = "gray75", name = "Number of DE genes") +
  ggtitle(paste(number_of_try, "Random Permutations:", baseline, "vs", target)) +
  xlab("Number of significant genes") +
  theme(aspect.ratio=1,
        plot.title = element_text(hjust = 0.5),
        legend.title = element_text(size=10, hjust = 0.5))

pdf(hist)
print({
  p
})
dev.off()

df <- data.frame(stringsAsFactors = FALSE)

for (i in 1:number_of_try) {
  if (i==1) {
    df = group_list[[i]]
  }
  else {
    df = cbind(df, group_list[[i]])
  }
  colnames(df)[i] = paste("perm",i, sep = "_")
}

write.csv(number_of_diff_genes, numGenes)
write.csv(df, permList)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
library("DESeq2")
library("reshape2")
library("cowplot")
library("limma")
library("vsn")
library("genefilter")
library("ggplot2")
library("dplyr")
library("RColorBrewer")
library("pheatmap")
library("hexbin")

# output files
MDS_out <- snakemake@output[['mds_plot']]
MDS_table <- snakemake@output[['mds_table']]
heatmap_out <- snakemake@output[['heatmap_plot']]
sd_out <- snakemake@output[['sd_plot']]
normCounts_out <- snakemake@output[['rlogCounts_plot']]
normCounts_fac <- snakemake@output[['rlogCounts_fac_plot']]
rawCounts_out <- snakemake@output[['counts_plot']]
rawCounts_fac <- snakemake@output[['counts_fac_plot']]

# parameters
sampleID <- snakemake@params[['sample_id']]
Type = snakemake@params[['linear_model']]
plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

# color palette
colors <- snakemake@params[['colors']]
discrete <- snakemake@params[['discrete']]

# DESeq2 objects
rld <- snakemake@input[['rld']]
dds <- snakemake@input[['rds']]

rld <- readRDS(rld)
dds <- readRDS(dds)

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rawCounts <- counts(dds, normalized=FALSE)
md <- as.data.frame(colData(rld))
md$SampleID <- rownames(md)

if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if (brewer.pal.info[colors[[1]],]$maxcolors >= length(unique(md[[Type]]))) {
        pal <- brewer.pal(length(unique(md[[Type]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(unique(md[[Type]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(unique(md[[Type]])))
}


df1 <- melt(rawCounts) %>%
  dplyr::rename(Gene=Var1) %>%
  dplyr::rename(SampleID=Var2) %>%
  dplyr::rename(counts=value)

iv <- match(df1$SampleID, md$SampleID)
df1$Condition <- paste(md[iv,][[Type]])
df1$SampleID <- factor(df1$SampleID, levels=unique(md$SampleID))

# aesthetic for plots
dodge <- position_dodge(width = 0.6)
theme_update(plot.title = element_text(hjust = 0.5))

p1 <- ggplot(data=df1, mapping=aes(x=SampleID, y=counts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_y_log10() +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=6))

# width of pdf to ensure all sampleIDs are visible when exported to pdf
# This was generated with a use case of 16 samples and a width of 7 fitting well, the +8 is to account for the margins
if (nrow(md)<8) {
  width <- 6
} else {
  width <- 7/24*(nrow(md)+8)
}

# raw counts boxplot
pdf(rawCounts_out, width, 5)
print({
  p1
})
dev.off()

# faceted by condition
p2 <- ggplot(data=df1, mapping=aes(x=SampleID, y=counts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_y_log10() +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=4)) +
  facet_wrap(~Condition)

pdf(rawCounts_fac, 2*width, 5)
print({
  plot_grid(p1, p2)
})
dev.off()

# Run same analysis for log2-transformed normalized counts
df2 <- melt(assay(rld)) %>%
  dplyr::rename(Gene=Var1) %>%
  dplyr::rename(SampleID=Var2) %>%
  dplyr::rename(normCounts=value)

# Add Condition information to this dataframe
iv <- match(df2$SampleID, md$SampleID)
df2$Condition <- paste(md[iv,][[Type]])
df2$SampleID <- factor(df2$SampleID, levels=unique(md$SampleID))

p1 <- ggplot(data=df2, mapping=aes(x=SampleID, y=normCounts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=6)) +
  ylab("regularized log expression")

# raw counts boxplot
pdf(normCounts_out, width, 5)
print({
  p1
})
dev.off()

# faceted by condition
p2 <- ggplot(data=df2, mapping=aes(x=SampleID, y=normCounts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=4)) +
  facet_wrap(~Condition) +
  ylab("regularized log expression")

pdf(normCounts_fac, 2*width, 5)
print({
  plot_grid(p1, p2)
})
dev.off()

# Standard deviation vs. mean
ntd <- normTransform(dds)

pdf(sd_out)
meanSdPlot(assay(ntd))
dev.off()

# Generate annotation column for heatmap
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(md), paste(md[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot$SampleID
  annot$SampleID <- NULL
} else {
  annot <- md[,subset_cols]
}

# Remove rows with no standard deviation so that pheatmap executes properly
filt <- assay(rld)[apply(assay(rld), MARGIN = 1, FUN = function(x) sd(x) != 0),]

hm <- pheatmap(filt, show_rownames=F, clustering_distance_rows = "correlation", 
         clustering_distance_cols = "correlation", clustering_method = "average", 
         annotation_col = annot, scale = "row", 
         main="Unsupervised heatmap of all gene counts across samples",
         fontsize_row=4, fontsize_col=6, fontsize=8,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(50))

save_pheatmap_pdf <- function(x, filename) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  pdf(filename)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

save_pheatmap_pdf(hm, heatmap_out)

# use plotMA function from limma, then extract data from this variable to plot with ggplot2
p <- plotMDS(assay(rld), top = 1000)
df <- data.frame(x=p$x, y=p$y, name=names(p$x))
iv <- match(df$name, md$SampleID)
df$Condition <- paste(md[iv,][[Type]])

pdf(MDS_out)
ggplot(data=df, mapping=aes(x=x,y=y)) +
  geom_point(size=3, colour = "black", show.legend = TRUE) +
  geom_point(aes(color=Condition), size=2.2) +
  scale_colour_manual(values=pal) +
  xlab("Leading logFC dim 1") +
  ylab("Leading logFC dim 2") +
  ggtitle("MDS Plot")
dev.off()

# Export the table for MDS
write.table(df, file=MDS_table, sep="\t", quote=F, row.names=FALSE)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
library("data.table")
library("qvalue")

stats_table <- snakemake@input[['stats_table']]
cat(sprintf(c('stats table: ', stats_table, '\n')))


qplot <- snakemake@output[['qplot']]
cat(sprintf(c('Qvalue Output: ', qplot, '\n')))

qhist <- snakemake@output[['qhist']]
cat(sprintf(c('Qvalue hist Output: ', qhist, '\n')))

out_table = snakemake@output[['table']]
cat(sprintf(c('Summary results table', out_table,'\n')))

stats_frame = read.table(stats_table, row.names=1, sep='\t', check.names=F)

qobj = qvalue(p=stats_frame$pvalue, fdr.level=T)

stats_frame$qvalues = qobj$qvalues
stats_frame$lfdr = qobj$lfdr
write.table(as.data.frame(stats_frame), file=out_table, quote=FALSE, sep='\t')

pdf(qplot)
plot(qobj)
dev.off()

pdf(qhist)
hist(qobj)
dev.off()
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
library(ggplot2)
library(ggrepel)

degFile = snakemake@input[['degFile']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

volcano_plot=snakemake@output[['volcano_plot']]

upCol = "#FF9999"
downCol = "#99CCFF"
ncCol = "#CCCCCC"

ens2geneID <- snakemake@config[['ens2geneID']]

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

## check if an rda file or tab sep
deg <- read.delim(file=degFile)

head(deg)
dim(deg)

## set all NA missing p-values to 1 (NA is DESeq2 default)
deg[is.na(deg$padj), "padj"] <- 1

## select up regulated genes
up <- deg$padj < adjp & deg$log2FoldChange > log2(FC)
sum(up)

## select down regulated genes
down <- deg$padj < adjp & deg$log2FoldChange < -log2(FC)
sum(down)

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
rownames(deg) <- sub("\\.[0-9]*", "", rownames(deg))
iv <- match(rownames(deg), gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
deg$Gene <- paste(gene_id[iv, "external_gene_name"])

# Grab the top 5 up and down regulated genes to label in the volcano plot
if (sum(up)>5) {
  upGenesToLabel <- head(deg[up,]$Gene, 5)
} else if (sum(up) %in% 1:5) {
  upGenesToLabel <- deg[up,]$Gene
}

if (sum(down)>5) {
  downGenesToLabel <- head(deg[down,]$Gene, 5)
} else if (sum(down) %in% 1:5) {
  downGenesToLabel <- deg[down,]$Gene
}

## calculate the -log10(adjp) for the plot
deg$log10padj <- -log10(deg$padj)

# assign up and downregulated genes to a category so that they can be labeled in the plot
deg$Expression <- ifelse(down, 'down',
                  ifelse(up, 'up','NS'))
deg$Expression <- factor(deg$Expression, levels=c("up","down","NS"))

# Assign colours to conditions
if (sum(up)==0 & sum(down)==0) {
  colours <- ncCol
} else if (sum(up)==0) {
  colours <- c(downCol, ncCol)
} else if (sum(down)==0) {
  colours <- c(upCol, ncCol)
} else {
  colours <- c(upCol, downCol, ncCol)
}

# Set all Infinity values to max out at 500 so that all points are contained in the plot
if ("Inf" %in% deg$log10padj) {
  deg$log10padj[deg$log10padj=="Inf"] <- max(deg[is.finite(deg$log10padj),"log10padj"]) + 2
}

# Assign genes to label based on whether genes are DE or not
if (exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- c(downGenesToLabel, upGenesToLabel)
} else if (exists("downGenesToLabel") & !exists("upGenesToLabel")) {
  genesToLabel <- downGenesToLabel
} else if (!exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- upGenesToLabel
}

if (exists("genesToLabel")) {
  p <- ggplot(data=deg, mapping=aes(x=log2FoldChange, y=log10padj, colour=Expression)) +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    geom_label_repel(aes(label=ifelse(Gene %in% genesToLabel, as.character(Gene),'')),box.padding=0.1, point.padding=0.5, segment.color="gray70", show.legend=FALSE) +
    geom_point() +
    ylab("-log10(FDR)") +
    xlab("log2(Fold Change)") +
    ggtitle(paste(target, "vs", baseline)) +
    scale_colour_manual(values=colours) +
    theme(plot.title = element_text(hjust = 0.5, face="plain"),
          axis.title.x = element_text(size=11),
          axis.title.y = element_text(size=11),
          panel.background = element_blank(),
          axis.line = element_line(colour = "gray45"),
          legend.key = element_rect(fill = "gray96"),
          legend.text = element_text(size = 10))
} else {
  p <- ggplot(data=deg, mapping=aes(x=log2FoldChange, y=log10padj, colour=Expression)) +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    geom_point() +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    ylab("-log10(FDR)") +
    xlab("log2(Fold Change)") +
    ggtitle(paste(target, "vs", baseline)) +
    scale_colour_manual(values=colours) +
    theme(plot.title = element_text(hjust = 0.5, face="plain"),
          axis.title.x = element_text(size=11),
          axis.title.y = element_text(size=11),
          panel.background = element_blank(),
          axis.line = element_line(colour = "gray45"),
          legend.key = element_rect(fill = "gray96"),
          legend.text = element_text(size = 10))
}


pdf(volcano_plot)
print({
  p
})
dev.off()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
library(Glimma)
library(limma)
library(DESeq2)

project_id = snakemake@params[['project_id']]

rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))


out_path = file.path(getwd(),'results','diffexp')
dir.create(out_path)

rds = readRDS(rds)
groups.df = as.data.frame(colData(rds))
glMDSPlot(rds, top = 1000, groups=groups.df,path=out_path,html=paste(project_id,'mds_plot',sep='.'),launch=FALSE)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
library(Glimma)
library(limma)
library(DESeq2)

condition = snakemake@params[['condition']]
cat(sprintf(c('Condition: ',condition,'\n')))
Type <- snakemake@config[['linear_model']]

title = snakemake@params[["contrast"]]

contrast = c(condition, snakemake@params[["contrast"]])
rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))

ens2geneID <- snakemake@config[['ens2geneID']]

out_path = file.path(getwd(),'results','diffexp')
dir.create(out_path)
print(out_path)
rds = readRDS(rds)
groups.df = as.data.frame(colData(rds))


#### by contrasts
contrasts_to_plot = resultsNames(rds)
res <- results(rds, contrast=contrast)
res$padj[is.na(res$padj)] = 1

rnaseq = as.data.frame(counts(rds, normalized=T))

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
rownames(res) <- sub("\\.[0-9]*", "", rownames(res))
iv <- match(rownames(res), gene_id$ensembl_gene_id)
head(gene_id[iv,])

res$GeneID  <- paste(gene_id[iv, "external_gene_name"])

res <- res[order(res$padj),]
# Remove duplicated geneIDs (this will, be default, remove the ones with the higher p-values, as we have ordered by p-value above
res <- res[!duplicated(res$GeneID),]

# Also subset your rnaseq counts table by these ensembl IDs so all of the data matches
rownames(rnaseq) <- sub("\\.[0-9]*", "", rownames(rnaseq))
rnaseq <- rnaseq[rownames(rnaseq) %in% rownames(res),]
# Now match gene symbols to these ensembl IDS
iv <- match(rownames(rnaseq), gene_id$ensembl_gene_id)
head(gene_id[iv,])
# Add gene symbols as rownames
rownames(rnaseq)  <- paste(gene_id[iv, "external_gene_name"])

# Now remove old ensembl IDs from results and paste gene symbol there
rownames(res) <- res$GeneID
res$GeneID <- NULL

genes = as.data.frame(row.names(res))
colnames(genes) = 'GeneID'

status_frame = res[,c('log2FoldChange','padj')]
status_frame['status'] = 0
status_frame$padj[is.na(status_frame$padj)] = 1
status_frame[status_frame$padj<0.05 & status_frame$log2FoldChange < 0 ,'status'] = -1
status_frame[status_frame$padj<0.05 & status_frame$log2FoldChange > 0 ,'status'] = 1

title = paste(title[1],'vs',title[2],sep='-')

glMDPlot(res, anno=genes, status=status_frame$status, samples=colnames(rnaseq), 
         counts=log2(rnaseq + 0.0001), groups=groups.df[[Type]], main=strsplit(res@elementMetadata$description[2],': ')[[1]][2], 
         transform=F, side.ylab='Log2-expression',launch=FALSE,side.main='GeneID', html = paste(title,'ma_plot',sep='.'),path=out_path)

## Volcano plot
glXYPlot(x=res$log2FoldChange, y=-log10(res$pvalue), xlab="logFC", ylab="logodds",path=out_path,
         status=status_frame$status, launch=FALSE,counts=log2(rnaseq + 0.0001), groups=groups.df[[Type]], anno=genes,main=strsplit(res@elementMetadata$description[2],': ')[[1]][2],html = paste(title,'volcano_plot',sep='.'))
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
degFile = snakemake@input[['degFile']]

assembly <- snakemake@params[['assembly']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

printTree <- snakemake@params[['printTree']]

library(GO.db)
library(topGO)
library(ggplot2)
library(RColorBrewer)
library(biomaRt)
library(GenomicFeatures)
library(Rgraphviz)

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

deg <- read.delim(file=degFile,header=TRUE,sep="\t")
rownames(deg) <- sub("\\.[0-9]*","",rownames(deg))

##---------load correct Biomart------------------------#
if (assembly == "hg19") {
    organismStr <- "hsapiens"
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg19.Ens_75.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.89") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 89!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_89.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.90") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_90.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "mm10") {
    organismStr <- "mmusculus"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/hg38.Ens_90.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}


##-----------------------------------Functions--------------------------------------#
runGO <- function(geneList,xx=xx,otype,setName){
    setLength       <- sum(as.numeric(levels(geneList))[geneList]) 
    fname           <- paste(Dir, paste(setName, otype, "GO.txt", sep="_"), sep="/")
    GOData          <- new("topGOdata", ontology=otype, allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
    resultFisher    <- runTest(GOData, algorithm = "classic", statistic = "fisher")## statistical test for topGO
    x               <- GenTable(GOData, classicFisher=resultFisher, topNodes=length(names(resultFisher@score)))## make go table for all terms
    x               <- data.frame(x)
    pVal            <- data.frame(pval=signif(resultFisher@score, 6)) ## get unrounded pvalue
    x$enrich        <- x$Significant/x$Expected ## calculate enrichment based on what you expect by chance
    x$p.unround     <- pVal[x$GO.ID,"pval"]## put unrounded pvalue in the table
    x$p.adj         <- signif(p.adjust(x$p.unround, method="BH"), 6)## calculate the adjusted pvalue with Benjamini & Hochberg correction
    x$log.p.adj     <- -log10(x$p.adj) ## convert adjusted p value to -log10 for plot magnitude
    #x$Term.full     <- sapply(x$GO.ID, FUN=function(n){Term(xx[[n]])}) ## get the full term name
    x <- x[order(x$GO.ID),]
    write.table(x, file=fname, sep="\t", col.names=TRUE, quote=FALSE, row.names=FALSE) ## save the table
    ## you can print the tree if you want, but since I keep the list of all of them skip
    if(printTree>0){
        printGraph(GOData,## make the tree for the go data
                   resultFisher,
                   firstSigNodes = 5,
                   fn.prefix = sub("_GO.txt$", "", fname),
                   useInfo = "all",
                   pdfSW = TRUE
                   )
    }
    return(x)  
}

## function to make barplot of -log10 adjusted pvalues colored by enrichment

drawBarplot <- function(go, ontology, setName){
    go <- go[!go$p.adj > 0.01,]
    if(nrow(go)>1){
        #go$Term.full <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term.full,1,50)), "\\,"), `[`, 1)))
        go$Term <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term,1,50)), "\\,"), `[`, 1)))
        print(setName)
        go <- go[with(go, order(p.adj, -enrich)),]
        ## Currently there is a discrepency between xx and x, so we only use Term right now, not Term.full
        #go$Term.full <-factor(paste(go$Term.full), levels=rev(paste(go$Term.full))) ## sort table by adjusted p-value
        go$Term <-factor(paste(go$Term), levels=rev(paste(go$Term))) ## sort table by adjusted p-value
        ptitle <- paste(ontology, setName) ## plot title
        ptitle <- gsub("^.*/","",ptitle)
        pfname <- paste(setName,ontology,"pdf",sep=".")## name of png file
        if(nrow(go) < 20 ){
            toprange <- 1:nrow(go)
        }else{
            toprange <- 1:20
        }
        top <- go[toprange,]
        col <- colorRampPalette(c("white","navy"))(16)   
        pdf(file=paste(Dir, pfname, sep="/"),height=5,width=7)
        print({
           p <- ggplot(top, aes(y=log.p.adj, x=Term, fill=enrich)) + ## ggplot barplot function
               geom_bar(stat="identity",colour="black") +
               ggtitle(ptitle) +
               xlab("") + ylab("-log10(fdr)") +
               scale_fill_gradient(low=col[2], high=col[15], name="enrichment", limits=c(0,ceiling(max(top$enrich))))+
               coord_flip()+
               theme(panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_blank(), 
                     panel.background = element_blank(), axis.line = element_line(colour = "black"))+
               theme(text = element_text(size=8),
                     axis.text.x = element_text(vjust=1,color="black",size=8),
                     axis.text.y = element_text(color="black",size=8),
                     plot.title=element_text(size=10))   
       })
       dev.off()
    }
}

print("get up genes and make geneList")
up <- deg$padj < adjp & deg$log2FoldChange >= log2(FC)
up <- unique(rownames(deg[up,]))
all <-unique(names(geneID2GO))
up.geneList <-  factor(as.integer(all %in% up))
names(up.geneList) <- all

up.setsize <- sum(as.numeric(levels(up.geneList))[up.geneList])
print("setsize for significant genes") 
up.setsize

adjplabel <- gsub("^0\\.","",adjp)
comparison <- gsub("\\.tsv$|\\.txt$|\\.rda$|\\.RData$","",degFile)

Dir <- sub("$", "/GOterms", dirname(comparison))
if(!(file.exists(Dir))) {
      dir.create(Dir,FALSE,TRUE)
}

if (up.setsize >=2) {

print("make GO table for the up genes")
#################################
go.UP.BP <- runGO(geneList=up.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))
#go.UP.MF <- runGO(geneList=up.geneList,xx=xx,otype="MF",setName=paste(comparison,"up",sep="."))

print("make the png for the up genes")
drawBarplot(go=go.UP.BP,ontology="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))

} else {
up_out = snakemake@output[[1]]
write.table('No Significant Genes', file=up_out)
}

print("get down genes and make geneList")
dn <- deg$padj < adjp & deg$log2FoldChange <= -log2(FC)
dn <- unique(rownames(deg[dn,]))
all <-unique(names(geneID2GO))
dn.geneList <-  factor(as.integer(all %in% dn))
names(dn.geneList) <- all

dn.setsize <- sum(as.numeric(levels(dn.geneList))[dn.geneList])
print("setsize for significant genes") 
dn.setsize

if(dn.setsize >= 2){

print("make GO table for down genes")
go.DN.BP <- runGO(geneList=dn.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

print("make barplot for down genes")
drawBarplot(go=go.DN.BP,ontology="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

}else{
down_out = snakemake@output[[2]]
write.table('No Significant Genes', file=down_out)
}
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
region = snakemake.params.get("region", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


shell("samtools stats {extra} {snakemake.input}"
      " {region} > {snakemake.output} {log}")
ShowHide 33 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/ohsu-cedar-comp-hub/cfRNA-seq-pipeline-Ngo-manuscript-2019
Name: cfrna-seq-pipeline-ngo-manuscript-2019
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...