A Snakemake workflow to analyse and visualise Illumina Infinium Methylation arrays

public public 1yr ago 0 bookmarks

A Snakemake workflow to analyse Illumina methylation arrays

Contents

Code Snippets

26
27
script:
    "../scripts/annotate.R"
28
29
script:
    "../scripts/dmrcatecpg.R"
54
55
script:
    "../scripts/dmrcatedmr.R"
22
23
script:
    "../scripts/pca.R"
SnakeMake From line 22 of rules/eda.smk
44
45
script:
    "../scripts/densityheatmap.R"
SnakeMake From line 44 of rules/eda.smk
27
28
script:
    "../scripts/filter.R"
21
22
script:
    "../scripts/normalise.R"
20
21
script:
    "../scripts/import.R"
36
37
script:
    "../scripts/getmset.R"
55
56
script:
    "../scripts/getrset.R"
71
72
script:
    "../scripts/getgrset.R"
23
24
script:
    "../scripts/densityqc.R"
43
44
script:
    "../scripts/densityqc.R"
62
63
script:
    "../scripts/boxplotqc.R"
19
20
script:
    "../scripts/controlstripqc.R"
38
39
script:
    "../scripts/detpqc.R"
54
55
script:
    "../scripts/msetqc.R"
74
75
script:
    "../scripts/densityqc.R"
94
95
script:
    "../scripts/densityqc.R"
113
114
script:
    "../scripts/boxplotqc.R"
SnakeMake From line 113 of rules/qcpre.smk
17
18
shell:
    "wget https://zhouserver.research.chop.edu/InfiniumAnnotation/current/{params.array}/{params.array}.{params.organism}.manifest.rds -O resources/{params.array}.{params.organism}.manifest.rds"
30
31
shell:
    "wget https://github.com/sirselim/illumina450k_filtering/blob/master/48639-non-specific-probes-Illumina450k.csv -O {output.probes}"
36
37
script:
    "../scripts/rankplot.R"
60
61
script:
    "../scripts/tracks.R"
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
addAnno <- function(dmrs, outputLoc = "nearestLocation", featureLocForDistance="TSS", 
                    bindingRegion=c(-2000, 2000), organism = "hg38"){

    library(GenomicRanges)
    library(ChIPpeakAnno)
    library(org.Hs.eg.db)

    dmrs = GRanges(dmrs)

    if(organism == "hg38"){   

        library(TxDb.Hsapiens.UCSC.hg38.knownGene)

        annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg38.knownGene)

    } 

    if(organism == "hg19"){

        library(TxDb.Hsapiens.UCSC.hg19.knownGene)

        annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)

    }

    seqlevelsStyle(dmrs) <- seqlevelsStyle(annoData)

    anno_dmrs <- annotatePeakInBatch(dmrs, AnnotationData = annoData, 
                                    output = outputLoc, 
                                    FeatureLocForDistance = featureLocForDistance,
                                    bindingRegion = bindingRegion)



    anno_dmrs$symbol <- xget(anno_dmrs$feature, org.Hs.egSYMBOL)

    return(anno_dmrs)

}


main <- function(input, output, params, log) {

    # Log

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script

    library(minfi)
    library(DMRcate)
    library(rtracklayer)

    dmrs <- readRDS(input$rds)

    # params
    outputLoc <- params$output # "nearestLocation"
    featureLocForDistance <- params$featureLocForDistance # "TSS"
    bindingRegion <- params$bindingRegion  # c(-2000, 2000)
    organism <- params$organism

    # output 
    save <- output$csv

    # run annotation
    dmrs = addAnno(dmrs, outputLoc, featureLocForDistance, bindingRegion, organism)

    # save output

    write.csv(as.data.frame(dmrs), save)

    rtracklayer::export(dmrs, output$bed) 

    saveRDS(dmrs, file = output$rds)


}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
plotBoxplots <- function(beta, phenodata, fill, group = "Sample_Group"){

  t_beta = t(beta)
  t_beta = as.data.frame(t_beta, drop=FALSE)
  t_beta$Sample_name = phenodata$Sample_Name
  t_beta$Group = as.character(phenodata[[group]])
  melt_beta = melt(t_beta)

  colnames(melt_beta) = c("Sample","Group", "CpG", "B.Val" )

  bp <- ggplot(melt_beta, aes(x=Sample, y=B.Val, fill = Group)) + 
    geom_boxplot()+
    labs(title="B vals Samples Pre Norm",x="Sample", y = "B.Val") + 
    theme_classic() + coord_flip() + scale_fill_manual(values=fill)

  return(bp)


}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  library(ggplot2)
  library(reshape2)

  GRset <- readRDS(input$rds)

  beta <- getBeta(GRset)

  phenodata <- pData(GRset)

  fill <- unlist(params$fill)

  # TODO See BW-31
  bp <- plotBoxplots(beta, phenodata, fill, group = "status")

  ggsave(output$pdf, plot = bp)


}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  RGSet <- readRDS(input$rds)

  qcReport(RGSet, pdf= output$pdf)


}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
densityHeatmapWrap <- function(GRset, col, samples_names, fill, cluster_columns = TRUE, clustering_distance_columns = "ks"){

  names(fill) = pData(GRset)[, col]

  ha1 = HeatmapAnnotation(group = pData(GRset)[, col], col = list(group = fill))

  mat <- as.matrix(getBeta(GRset))

  colnames(mat) <- pData(GRset)[, samples_names]

  p <- densityHeatmap(mat, top_annotation = ha1, 
                      cluster_columns = cluster_columns, clustering_distance_columns = "ks", 
                      ylab = "Beta-values")

  return(p)

}


main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  library(ComplexHeatmap)

  GRset <- readRDS(input$rds)

  samples_names <- params$samples_names

  col <- params$group

  fillList <- params$fill

  # Check if names supplied are in GRSet metadata
  stopifnot(names(fillList) %in% pData(GRset)[, col])

  fill <- sapply(pData(GRset)[, col], function(x, fillList){as.character(fillList[x])}, fillList = fillList)

  # ensure character vector in correct order for plotting
  stopifnot(names(fill) == pData(GRset)[, col])

  cluster_columns <- params$cluster_columns

  clustering_distance_columns <- params$clustering_distance_columns

  # Plot

  pdf(output$pdf)

  p <- densityHeatmapWrap(GRset, col, samples_names, fill, cluster_columns = params$cluster_columns, 
                          clustering_distance_columns = params$clustering_distance_columns)

  draw(p)

  dev.off()

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
getData <- function(GRset, type){

  if(type == "beta"){

    data = getBeta(GRset)

  }

  if(type == "M"){

    data = getM(GRset)

  }

  return(data)
}


plotDensity <- function(data, phenodata, output, fill, group = "Sample_Group"){


  ## Density plot
  pdf(output)

  densityPlot(data, sampGroups = phenodata[[group]], legend = F, pal = fill)

  legend("top", legend = levels(factor(phenodata[[group]])),
         text.col=fill)

  dev.off()



}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  library(ggplot2)

  GRset <- readRDS(input$rds)

  data <- getData(GRset, params$type)

  phenodata <- pData(GRset)

  fill <- unlist(params$fill)

  # TODO See BW-31
  plotDensity(data, phenodata, output$pdf, fill, group = "status")


}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
qcPval <- function(detP,targets, output, fill, group = "Sample_Group"){

  pal <- fill

  data <- colMeans(detP)

  names(data) <- targets$Sample_Name

  pdf(output,width=12)

  par(mar = c(9,8,1,1))
  barplot(data, col = pal[factor(targets[[group]])], las = 2,
          cex.names = 0.8, ylim = c(0,0.02), ylab = "Mean detection p-values")
  abline(h=0.01,col = "red")
  legend("topright", legend = levels(factor(targets[[group]])), fill = pal,
         bg = "white")

  dev.off()

}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  # tsv file location

  RGset <- readRDS(input$rds)

  detP <- detectionP(RGset)

  targets <- read.metharray.sheet(params$dir)

  fill <- unlist(params$fill)

  # TODO See BW-31
  qcPval(detP, targets, output$pdf, fill, group = "status")


}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
modelMatrix <- function(data) {

  # Get phenotype data

  names <- colnames(data)

  # Set condition factor

  if ("condition" %in% names) {

    condition <- factor(data$condition)

    n.condition <- nlevels(condition)

    is.condition <- n.condition > 1

  } else {

    is.condition <- FALSE

  }

  # Set batch factor

  if ("batch" %in% names) {

    batch <- factor(data$batch)

    n.batch <- nlevels(batch)

    is.batch <- n.batch > 1

  } else {

    is.batch <- FALSE

  }

  # Set block factor

  if ("block" %in% names) {

    block <- factor(data$block)

    n.block <- nlevels(block)

    is.block <- n.block > 1

  } else {

    is.block <- FALSE

  }

  # Construct design matrix

  if (is.condition & !is.batch & !is.block) {

    design <- model.matrix(~ 0 + condition)

  }

  if (is.condition & is.batch & !is.block) {

    design <- model.matrix(~ 0 + condition + batch)

  }

  if (is.condition & !is.batch & is.block) {

    design <- model.matrix(~ 0 + condition + block)

  }

  if (is.condition & is.batch & is.block) {

    design <- model.matrix(~ 0 + condition + batch + block)

  }

  # Rename condition coefficients

  which.condition <- seq_len(n.condition)

  colnames(design)[which.condition] <- levels(condition)

  # Required for DMRcate if using numeric coef
  # colnames(design)[1] <- "(Intercept)"

  # Return design matrix

  design

}

# Makes sure sample tsv is in the same order as methylation object coldata

checkOrder <- function(sampledata, rds, colname = "Sample_Name") {

  sampledata <- sampledata[match(colData(rds)[[colname]],sampledata$sample),]

  return(sampledata)

}


main <- function(input, output, params, log, config) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(DMRcate)
  library(rtracklayer)
  library(minfi)
  library(limma)

  GRset <- readRDS(input$rds)

  data <- read.table(params$samples, header = T)

  data <- checkOrder(data, GRset)

  mod <- modelMatrix(data)

  #### Run DMRCate 

  # Already ratio converted so no need to run below:
  # GRsetRatio <- ratioConvert(GRset)

  GRsetRatio <- GRset

  arraytype = params$arraytype

  # cpg.annotate expects array string as 450K but config specifies as HM450
  if(arraytype == "HM450"){

    arraytype = "450K"

  }

  analysis.type = params$analysistype

  # Changing to limma style contrast matrix makes easier to specify comparison explicitly in config
  # See BW-23
  con <- params$contrast

  # limma style contrast matrix
  cont.matrix <- makeContrasts(contrasts=con, levels=mod)

  # must be a column name in cont.matrix
  # Should correspond to contrast given as param con
  coef <- colnames(cont.matrix)[1]
  stopifnot(coef == con)

  fdr = params$fdr

  # See BW-23 Added cont.matrix = cont.matrix
  # Allowed us to remove intercept from desing matrix
  # Otherwise design matrix must have intercept and coef must be numeric
  cpg.annotation <- cpg.annotate("array", GRsetRatio, arraytype = arraytype, cont.matrix = cont.matrix,
                               analysis.type = analysis.type, design = mod, coef = coef,
                               contrasts = TRUE, fdr = fdr)


  write.csv(as.data.frame(cpg.annotation@ranges), file = output$csv, quote = F)

  saveRDS(cpg.annotation, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config)
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
liftover <- function(results.ranges, chainfile){

  # hg19ToHg38.over.chain from https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/

  ch <- import.chain(chainfile)

  results <- rtracklayer::liftOver(results.ranges, ch)

  results <- unlist(results)

  return(results)
}

main <- function(input, output, params, log, config) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(DMRcate)
  library(rtracklayer)
  library(minfi)
  library(ExperimentHub)

  cpg.annotation <- readRDS(input$rds)

  # Below are default values
  # fdr cutoff default applied and advised by authors
  # see opt pcutoff = "fdr"
  dmrcoutput <- dmrcate(cpg.annotation, lambda=1000, C=2)

  results.ranges <- extractRanges(dmrcoutput, genome = "hg19")
  #results.ranges <- dmrcateExtractRanges(dmrcoutput, genome = "hg19")

  seqlevelsStyle(results.ranges) <- "UCSC"

  # chainfile <- params$chainfile
  chainfile <- input$chainfile

  if(params$liftover){

    results <- liftover(results.ranges, chainfile)

  } else {

    results <- results.ranges

  }

  # order by fdr of choice

  fdr <- params$fdr

  results = results[order(mcols(results)[[fdr]]),]

  write.csv(as.data.frame(results), file = output$csv, quote = F)

  saveRDS(results, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config)
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
filterByDetP <- function(GRset, RGset){

  detP <- detectionP(RGset)

  detP <- detP[match(featureNames(GRset),rownames(detP)),]

  # remove any probes that have failed in one or more samples
  keep <- rowSums(detP < 0.01) == ncol(GRset)

  # filter out bad probes
  GRset <- GRset[keep,]

  return(GRset)
}

filterBySexChrom <- function(GRset){

  annEPIC <- minfi::getAnnotation(GRset)
  keep <- !(featureNames(GRset) %in% annEPIC$Name[annEPIC$chr %in% c("chrX","chrY")])

  GRset <- GRset[keep,]

  return(GRset)

}

filterBySnpProbes <- function(GRset){

  GRset <- dropLociWithSnps(GRset, snps = c("SBE","CpG"), maf = 0)

  return(GRset)

}

filterByXReactiveProbes <- function(GRset, xprobes){

  xReactiveProbes <- xprobes

  keep <- !(featureNames(GRset) %in% xReactiveProbes$TargetID)

  GRset <- GRset[keep,]

  return(GRset)

}

filterByExtendedAnno <- function(GRset, anno){

  ### Drop all probes that are cross reactive, dont map to hg38/hg19, extended SNP annotation (see Nucleic acid research paper)
  cpg_mask <- names(anno)[anno$MASK_general == TRUE]

  keep <- !(featureNames(GRset) %in% cpg_mask)

  GRset <- GRset[keep,]

  return(GRset)

}

getAnno <- function(annoType = "hg38", array = "HM450", static = TRUE){


  # anno <- sesameDataGetAnno("EPIC/EPIC.hg38.manifest.rds")

  if(static){

    anno <- readRDS(paste0("resources/", array, ".", annoType, ".manifest.rds"))

  } 
  if(!static) {

    sesameDataCache(array)

    anno <- sesameDataGetAnno(paste0(array, "/", array, ".", annoType, ".manifest.rds"))
  }

  return(anno)

}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  #library(sesameData)

  GRset <- readRDS(input$rds)

  # or could do hg38Anno <- sesameDataGetAnno("EPIC/EPIC.hg38.manifest.rds")
  # anno <- readRDS(params$anno)

  # Have gone with downloading the annotation per analysis - but can add to /resources if too slow
  anno <- getAnno(params$anno, params$array, params$static)

  RGset <- readRDS(params$rgset)

  # "48639-non-specific-probes-Illumina450k.csv"
  # xprobes <- read.csv(file=params$xprobes, stringsAsFactors=FALSE)
  xprobes <- read.csv(file=input$xprobes, stringsAsFactors=FALSE)

  GRsetFilt <- filterByDetP(GRset, RGset)
  GRsetFilt <- filterBySexChrom(GRset)
  GRsetFilt <- filterBySnpProbes(GRset)
  GRsetFilt <- filterByXReactiveProbes(GRset, xprobes)
  GRsetFilt <- filterByExtendedAnno(GRset, anno)

  # Note we may like to add additional filtering based on updates to manifest from Illumina:
  # See https://github.com/sirselim/illumina450k_filtering

  saveRDS(GRsetFilt, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  RSet <- readRDS(input$rds)

  GRset <- mapToGenome(RSet)

  saveRDS(GRset, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  RGset <- readRDS(input$rds)

  MSet <- preprocessRaw(RGset) 

  saveRDS(MSet, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  MSet <- readRDS(input$rds)

  RSet <- ratioConvert(MSet, what = params$what, keepCN = params$keepCN)

  saveRDS(RSet, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  targets <- read.metharray.sheet(input$dir)

  sample_table <- read.table(input$samples, header = T)

  # match targets with sample table
  targets <- targets[match(sample_table[,"sample"], targets[,"Sample_Name"]),]

  stopifnot(targets[,"Sample_Name"] == sample_table[,"sample"])

  # add sample table to targets
  targets <- cbind.data.frame(targets, sample_table)

  RGset <- read.metharray.exp(targets = targets)

  saveRDS(RGset, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  library(ggplot2)

  # tsv file location

  MSet <- readRDS(input$rds)

  qc <- getQC(MSet)

  pdf(output$pdf)

  plotQC(qc)

  dev.off()

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
normalise <- function(RGset, type) {

  if(type == "preprocessFunnorm"){

    # Returns a GRset

    obj <- preprocessFunnorm(RGset)

  }

  if(type == "preprocessSWAN"){

    # NOTE - this returns an Mset not a GRset

    obj <- preprocessSWAN(RGset)

  }

  if(type == "preprocessQuantile"){

    # Returns a GRset

    obj <- preprocessQuantile(RGset)

  }

  if(type == "preprocessNoob"){

    # NOTE - this returns an Mset not a GRset

    obj <- preprocessNoob(RGset)

  }

  if(type == "preprocessIllumina"){

    # NOTE - this returns an Mset not a GRset
    # Also not recommended type of normalisation

    obj <- preprocessIllumina(RGset)

  }

  return(obj)
}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)

  RGset <- readRDS(input$rds)

  print(sessionInfo())
  GRset <- normalise(RGset, type = params$type)

  saveRDS(GRset, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
plotPCA <- function(x, ...) {

  UseMethod("plotPCA")

}

plotPCA.GenomicRatioSet <- function(x, col, fill) {

  mat <- getBeta(x)

  var <- matrixStats::rowVars(mat)

  num <- min(500, length(var))

  ind <- order(var, decreasing = TRUE)[seq_len(num)]

  pca <- prcomp(t(mat[ind, ]))

  pct <- (pca$sdev ^ 2) / sum(pca$sdev ^ 2)

  dat <- data.frame(PC1 = pca$x[,1], PC2 = pca$x[,2], group = pData(x)[, col])

  p <- ggplot(dat, aes_string(x = "PC1", y = "PC2", color = "group")) + 
    geom_point(size = 3) + 
    xlab(paste0("PC1: ", round(pct[1] * 100), "% variance")) + 
    ylab(paste0("PC2: ", round(pct[2] * 100), "% variance")) + 
    coord_fixed() + scale_colour_manual(values=fill)

  return(p)
}

main <- function(input, output, params, log) {

  # Log function

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script function

  library(ggplot2)

  library(minfi)

  x <- readRDS(input$rds)

  # convert fill to named vector as preserves names
  fill = unlist(params$fill)

  p <- plotPCA(x, col = params$group, fill = fill)

  ggsave(output$pdf, plot = p)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
prepRes <- function(results, yaxis = "meandiff", symbol = "overlapping.genes"){

  results <- as.data.frame(mcols(results)[,c(symbol, yaxis)])

  # Change colnames (expects data parsed as symbol, enrichment)

  colnames(results) <- c("symbol","score")

  # Rank by value

  results <- results[order(-results$score),]

  results$rank <- rep(1:length(results$score))

  return(results)
}

#### Mean diff rank plots 
# ranked_data assumed to be 2 col df with gene symbol and enrichment score titled symbol and score 
plotRank <- function(ranked_data, save, genes_of_interest = NULL, 
                     textsize = 0.8, pad = 2, pointsize = 1, highlight = 30, 
                     title = "Rank Plot", yaxis = "meandiff", number_show = 10, num_score_thres = 0.5){

  library(plyr)
  library(ggrepel)
  library(ggplot2)

  # Define gene names of interest if none provided

  if (is.null(genes_of_interest)){

    genes_of_interest <- c(head(ranked_data$symbol, (number_show/2)), tail(ranked_data$symbol, (number_show/2)))

    genes_of_interest <- genes_of_interest[!is.na(genes_of_interest)]
  }

  # Define state column for plotting of colour points and labelled points

  ranked_data$state = rep(FALSE,length(rownames(ranked_data)))

  ranked_data$state = ifelse(ranked_data$rank %in% 1:highlight | ranked_data$rank %in% (length(ranked_data$rank)-highlight):length(ranked_data$rank), "Yes", "No")

  # This will also pick up DMRs with associated genes that are not unique (multiple peaks per gene if they are within the top hits)
  ranked_data$state = ifelse(ranked_data$symbol %in% genes_of_interest & abs(ranked_data$score) > num_score_thres, "Highlight", ranked_data$state)

  # Write out underlying CSV file of rank in case required downstream

  write.table(as.data.frame(ranked_data), file = save, quote = F, sep = "\t", row.names = FALSE)

  # Ggplot obj

  num_highlight <- round(min(number_show, nrow(ranked_data[ranked_data$state == "Highlight",]))/2)
  highlight_df <- rbind.data.frame(head(ranked_data[ranked_data$state == "Highlight",], num_highlight), tail(ranked_data[ranked_data$state == "Highlight",], num_highlight))

  p = ggplot(ranked_data, aes(rank, score)) +
        geom_point(aes(col=state), size = pointsize, alpha = 0.5) + 
        geom_point(data = subset(ranked_data, state %in% c('Yes')), aes(col = state), size = pointsize, alpha = 0.05) + 
        geom_point(data = subset(ranked_data, state %in% c("Highlight")), aes(col = state), size = pointsize, alpha = 0.5) +
        scale_color_manual(values = c("#f05b5b", "#828282", "#5b88f0"), guide = FALSE) +
        geom_text_repel(data = highlight_df, aes(label = symbol), size = textsize, point.padding = pad) + 
        geom_rect(data = head(ranked_data, 1), aes(ymin=-num_score_thres, ymax=num_score_thres, xmin=-Inf, xmax=Inf), alpha= 0.5) +
        geom_hline(yintercept = 0, linetype="solid", color = "black", size=0.5, alpha= 0.5) + 
        geom_hline(yintercept = -num_score_thres, linetype="dashed", color = "black", size=0.5, alpha= 0.5) + 
        geom_hline(yintercept = num_score_thres, linetype="dashed", color = "black", size=0.5, alpha= 0.5) + 
        ggtitle(title) + 
        labs(y = yaxis, x = "Rank") + 
        theme_classic() + 
        theme(plot.title = element_text(hjust = 0.5, size=rel(1.6)) ,
              axis.text.x = element_text(angle = 45, vjust = 1, hjust=1, size=rel(1.6)), 
              axis.title.x = element_text(size=rel(1.6)),
              axis.text.y = element_text(size=rel(1.6)),
              axis.title.y = element_text(size=rel(1.6)))

  return(p)

}

main <- function(input, output, params, log) {

  # Log

  out <- file(log$out, open = "wt")

  err <- file(log$err, open = "wt")

  sink(out, type = "output")

  sink(err, type = "message")

  # Script

  library(minfi)
  library(DMRcate)

  results <- readRDS(input$rds)

  # Define params for rank plot function
  # note dmrcate results file for ranking is names meandiff (in the past it was meanbetafc)
  save <- output$tsv
  genes_of_interest <- params$genes_of_interest
  width <- params$width
  height <- params$height
  textsize <- params$textsize
  pad <- params$pad
  pointsize <- params$pointsize
  highlight <- params$highlight
  title <- params$title
  yaxis <- params$yaxis
  number_show <- params$number_show
  num_score_thres <- params$num_score_thres

  symbol <- params$symbol # overlapping.genes

  # Run prep results table 
  ranked_data <- prepRes(results, yaxis, symbol = symbol)

  # Plot rank plot
  p <- plotRank(ranked_data, save, genes_of_interest,
                     textsize, pad, pointsize, highlight, 
                     title, yaxis, number_show, num_score_thres)

  ggsave(output$pdf, plot = p, width = width,  height = height)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
getTrackObj <- function(filter, anno = "hg38", array = "HM450", combine = "mean", by = "status"){

  library(TxDb.Hsapiens.UCSC.hg38.knownGene)

  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

  manifest <- readRDS(paste0("resources/", array, ".", anno, ".manifest.rds"))

  # only keep cpgs which are in final filtered Genomic Ratio set
  keep <- names(manifest) %in% featureNames(filter)

  manifest <- manifest[keep,]

  # Get Beta table from GRSet

  beta <- getBeta(filter)

  # match order of beta table with manifest meta data

  beta <- beta[match(names(manifest), rownames(beta)),]

  # check rownames equal

  stopifnot(rownames(beta) == names(manifest))

  # Add new colnames to beta table matching colData

  stopifnot(colnames(beta) == rownames(colData(filter)))

  colnames(beta) <- colData(filter)$Sample_Name

  # Add beta signal to tracks GRanges mcols instead of other data

  tracks <- manifest 

  mcols(tracks) <- beta

  # Turn into list of separate GRanges objects

  if (is.null(combine)) {

    tracksList <- lapply(colnames(mcols(tracks)), function(x, tracks){tracks[, colnames(mcols(tracks)) == x] } , tracks = tracks)

    names(tracksList) = colnames(mcols(tracks))

    tracksList <- lapply(tracksList, filterTrackOverlaps)


  } else {

    combine_by <- unique(colData(filter)[[by]])

    tracksList <- lapply(combine_by, combineBeta, tracks = tracks, colData=colData(filter), by = by, combine = combine)

    names(tracksList) = combine_by

  }


  return(tracksList)
}

# Combine samples together based on colData col name and label into combined tracks

combineBeta <- function(label, tracks, colData, by, combine = "mean", samplename = "Sample_Name"){

  library(TxDb.Hsapiens.UCSC.hg38.knownGene)

  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

  samples_int <- colData[colData[[by]] %in% label,][[samplename]]

  if(combine == "mean"){

    combineMeta <- rowMeans(as.data.frame(mcols(tracks[, colnames(mcols(tracks)) %in% samples_int])))

  } 

  if(combine == "median"){

    combineMeta <- rowMedians(as.data.frame(mcols(tracks[, colnames(mcols(tracks)) %in% samples_int])))

  }

  tracks_new <- tracks

  mcols(tracks_new) <- combineMeta

  tracks_new <- filterTrackOverlaps(tracks_new)

  return(tracks_new)

}

# Parse GRranges object to ensure ready for output
# Ensures Beta col is labelled score
# Removes indeterminate chrs (*)
# Removes Cpg sites which overlap boundaries - this should not be the case with any CpG GRanges obj but rtracklayer also does not want them to share boundaries

filterTrackOverlaps <- function(tracks){

    library(TxDb.Hsapiens.UCSC.hg38.knownGene)

    txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

    colnames(mcols(tracks)) <- "score"

    seqlevelsStyle(tracks) <- "UCSC"

    seqlevels(tracks) <- seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]

    seqinfo(tracks) <- seqinfo(txdb)[seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]]

    # take tracks which share boundaries and remove them
    tracks <- tracks[!tail(start(tracks), -1) <= head(end(tracks), -1)]

    # resort
    tracks <- sort(tracks)

    return(tracks)

}

# Save out track via rtracklayer

saveTrack <- function(sample, tracks, fileExt = ".BigWig", location = "./"){

  library(rtracklayer)

  track <- tracks[sample][[1]]

  rtracklayer::export(track, paste0(location, sample, fileExt))  

}

main <- function(input, output, params, log) {

    # Log

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script

    library(minfi)
    library(DMRcate)
    library(rtracklayer)

    filter <- readRDS(input$rds)

    # params
    anno <- params$anno # "hg38", "hg19"
    array <- params$array # "EPIC", "HM450"
    combine <- params$combine  # "mean", "median"
    by <- params$by # "sample"

    # output 
    save <- output$rds

    # run annotation
    tracks = getTrackObj(filter, anno, array, combine, by)

    # save output
    # Bigwig
    lapply(names(tracks), saveTrack, track = tracks, fileExt = ".BigWig", location = params$bwLocation )

    # Bedgraph
    lapply(names(tracks), saveTrack, track = tracks, fileExt = ".bedGraph", location = params$bwLocation)

    # save out list of GRanges 
    saveRDS(tracks, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)
ShowHide 34 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/zifornd/memeth
Name: memeth
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...