Workflow Steps and Code Snippets
530 tagged steps and code snippets that match the keyword dplyr
WES analysis pipeline
```r
# Point R at the conda environment's time-zone database, then load libraries
Sys.setenv("TZDIR" = paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))
library(readr)
library(dplyr)
library(ggplot2)

df <- read_csv(snakemake@input[[1]])
df <- df %>%
  mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))

# Boxplot of average sequencing depth per sample type
p <- df %>%
  ggplot(aes(sample_type, mean)) +
  geom_boxplot(aes(fill = sample_type)) +
  labs(x = "", y = "Average Depth") +
  guides(fill = guide_legend(title = "")) +
  ggtitle("Sequencing Depths")

ggsave(snakemake@output[[1]], p)
```
Finding cryptic relationships to boost disease gene detection (v0.3.0dev1-1)
```r
library(ggplot2)
library(dplyr)
library(tribes.tools)
```
Processing sorted cells from BC1 that have been bulk RNA-sequenced
```r
# Redirect output and messages to the Snakemake log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library(DESeq2)
library(IHW)
library(ggplot2)
library(dplyr)
library(stringr)

if (snakemake@threads > 1) {
  library("BiocParallel")
  parallel <- TRUE
  register(MulticoreParam(snakemake@threads))
} else {
  parallel <- FALSE
}

# Transcript-to-gene mapping, reduced to unique gene/symbol pairs
t2g_fp <- snakemake@input[[2]]
t2g <- readr::read_table2(t2g_fp, col_names = c("tx", "ensgene", "symbol"))
t2g <- t2g %>% dplyr::distinct(ensgene, symbol)

dds <- readRDS(snakemake@input[[1]])
print(dds)

# Grab last variable at end of formula for contrasts
design_formula <- snakemake@params[['formula']]
s <- str_remove_all(design_formula, " |~")
s <- str_split(s, "\\+")
vars <- s[[1]]
var <- vars[length(vars)]

print(snakemake@params[['contrast']])
print("Creating results for the following variable:")
print(var)
print(resultsNames(dds))

contrast_coef <- paste(c(var, snakemake@params[['contrast']][1], "vs",
                         snakemake@params[['contrast']][2]), collapse = "_")
de_contrast <- c(var, snakemake@params[['contrast']][1],
                 snakemake@params[['contrast']][2])

# MLE log2 fold changes with IHW filtering; MAP (shrunken) LFCs via apeglm
mle_res <- results(dds, contrast = de_contrast, filterFun = ihw,
                   alpha = .05, parallel = parallel)
map_res <- lfcShrink(dds, coef = contrast_coef, type = "apeglm",
                     parallel = parallel)

print("MLE LFC:")
print(mle_res)
print(summary(mle_res))
print("MAP LFC:")
print(map_res)
print(summary(map_res))

# Annotate results with gene symbols; keep one row per symbol, sorted by padj
mle_df <- mle_res %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "ensgene") %>%
  as_tibble() %>%
  left_join(t2g) %>%
  dplyr::select(symbol, ensgene, everything()) %>%
  dplyr::arrange(padj) %>%
  dplyr::distinct(symbol, .keep_all = TRUE)

map_df <- map_res %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "ensgene") %>%
  as_tibble() %>%
  left_join(t2g) %>%
  dplyr::select(symbol, ensgene, everything()) %>%
  dplyr::arrange(padj) %>%
  dplyr::distinct(symbol, .keep_all = TRUE)

print("MLE dataframe")
print(mle_df)
print("MAP dataframe")
print(map_df)

# MA-style plots for both sets of fold changes
mleplot <- mle_df %>%
  dplyr::mutate(
    significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
    direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
  ) %>%
  ggplot(aes(log10(baseMean), log2FoldChange)) +
  geom_point(aes(color = significant, shape = direction)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "log10 Expression", y = "MLE Log2FoldChange") +
  theme_bw()

mapplot <- map_df %>%
  dplyr::mutate(
    significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
    direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
  ) %>%
  ggplot(aes(log10(baseMean), log2FoldChange)) +
  geom_point(aes(color = significant, shape = direction)) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(x = "log10 Expression", y = "MAP Log2FoldChange") +
  theme_bw()

readr::write_csv(mle_df, snakemake@output[['mleres']])
readr::write_csv(map_df, snakemake@output[['mapres']])
ggsave(snakemake@output[['mlema']], plot = mleplot, width = 10, height = 7)
ggsave(snakemake@output[['mapma']], plot = mapplot, width = 10, height = 7)
```
Repository to process ITS amplicon sequencing data and produce figures for Schneider et al. (2021)
```r
args = commandArgs(trailingOnly = TRUE)

itsx_in <- snakemake@input$its_in
itsx_out <- snakemake@input$its_out
sq.nc <- snakemake@input$seqtab
out <- snakemake@output$seqtab
out_mock <- snakemake@output$mock
threads <- snakemake@threads
mock <- snakemake@params$mock
mock_samples <- strsplit(mock, ",")[[1]]

library(dada2); packageVersion("dada2")
library(dplyr)
library(Biostrings)

dna <- readDNAStringSet(itsx_in)
dna_itsx <- as.character(readDNAStringSet(itsx_out))
seqtab.nc <- readRDS(sq.nc)

# Keep only sequences that survived ITSx extraction, then swap in the
# extracted (ITS-only) sequences as column names
colnames(seqtab.nc) <- names(dna)
seqtab.nc <- seqtab.nc[, colnames(seqtab.nc) %in% names(dna_itsx)]
colnames(seqtab.nc) <- dna_itsx
seqtab.nc <- t(seqtab.nc)

# Remove sequences below 50 bp
if (any(nchar(getSequences(t(seqtab.nc))) < 50)) {
  dna_itsx <- dna_itsx[!(nchar(dna_itsx) < 50)]
  seqtab.nc <- seqtab.nc[nchar(rownames(seqtab.nc)) >= 50, ]
}

# At this point we can summarise all identical sequences
seqtab.nc2 <- cbind.data.frame(sequence = rownames(seqtab.nc), seqtab.nc)
seqtab.nc3 <- group_by(seqtab.nc2, sequence) %>%
  summarise(across(everything(), sum))

# Split off mock-community samples, if any are present
if (length(intersect(colnames(seqtab.nc3), mock_samples)) > 0) {
  mock_cols <- append(c("sequence"), intersect(colnames(seqtab.nc3), mock_samples))
  mock <- seqtab.nc3 %>% select(all_of(mock_cols))
  seqtab.nc4 <- seqtab.nc3 %>% select(!any_of(mock_cols))
  saveRDS(mock, out_mock)
} else {
  seqtab.nc4 <- seqtab.nc3[, colnames(seqtab.nc3) != "sequence"]
}

rownames(seqtab.nc4) <- seqtab.nc3$sequence
seqtab.nc4 <- data.matrix(t(seqtab.nc4))
saveRDS(seqtab.nc4, out)
```
Workflow for transcript expression analysis.
```r
library(gage)
library(gageData)
library(pathview)
library(dplyr)

#p_all <- read.table("../sleuth/p-values_all_transcripts.csv", header=TRUE)
#matr <- read.table("../sleuth/sleuth_matrix.csv", header=TRUE)
#samples <- read.table("../table_for_reads.csv", header=TRUE, stringsAsFactors = TRUE)
p_all <- read.table(snakemake@input[["pval"]], header = TRUE)
matr <- read.table(snakemake@input[["matrix"]], header = TRUE)
samples <- read.table(snakemake@input[["samples"]], header = TRUE, stringsAsFactors = TRUE)

# p_all is sorted by p-value; reorder it by gene ID to match the matrix:
p_all <- dplyr::arrange(p_all, target_id)
if (any(p_all$target_id != row.names(matr))) {
  stop("The count matrix and the sleuth significance results do not match!")
}

# Replace target_id with gene names
rownames(matr) = make.names(p_all$ext_gene, unique = TRUE)

# Split the samples by condition (assumes exactly two conditions)
condition_1 <- samples$sample[samples$condition == levels(factor(samples$condition))[1]]
condition_2 <- samples$sample[samples$condition == levels(factor(samples$condition))[2]]
samples.cond_1 <- matr[, as.character(condition_1)]
samples.cond_2 <- matr[, as.character(condition_2)]

# log2 fold change
FoldChange <- rowSums(samples.cond_2) / rowSums(samples.cond_1)
log2FC <- log2(FoldChange)
p_val <- p_all$pval
q_val <- p_all$qval

# Build the data frame for gage with:
# gene/target ID,
# log2 fold change,
# p-values from the sleuth analysis, and
# multiple-testing-corrected p-values (qval, correcting for alpha-error
# accumulation in multiple testing) from the sleuth analysis
gage.data <- data.frame(TargetID = p_all$target_id,
                        EnsemblID = p_all$ens_gene,
                        GeneID = p_all$ext_gene,
                        log2FoldChange = log2FC,
                        pVal = p_val,
                        PostHoc_pValues = q_val,
                        Mean = p_all$mean_obs)
gage.data <- na.omit(gage.data)

data(kegg.sets.hs)
data(sigmet.idx.hs)
kegg.sets.hs <- kegg.sets.hs[sigmet.idx.hs]

# Get the results
keggres <- gage(gage.data$log2FoldChange, gsets = kegg.sets.hs, same.dir = TRUE)

# Look at both up (greater), down (less), and statistics.
lapply(keggres, head)

# Get the top ten upregulated pathways
keggrespathways <- data.frame(id = rownames(keggres$greater), keggres$greater) %>%
  as_tibble() %>%
  filter(row_number() <= 10) %>%
  .$id %>%
  as.character()
keggrespathways

# Get the IDs.
keggresids <- substr(keggrespathways, start = 1, stop = 8)
keggresids

# Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data = gage.data$log2FoldChange,
                                      pathway.id = pid, species = "hsa",
                                      new.signature = FALSE)

# Plot multiple pathways (plots saved to disk; returns a throwaway list object)
tmp = sapply(keggresids, function(pid) pathview(gene.data = gage.data$log2FoldChange,
                                                pathway.id = pid, species = "hsa"))
```
Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences
```r
library(ggpubr)
library(dplyr)
library(purrr)
library(BioQC)
library(ggplot2)
library(data.table)

args <- commandArgs(TRUE)

# Normalised tissue-specificity entropy (BioQC)
entrop <- function(a) {
  ent <- entropySpecificity(a, norm = TRUE)
  return(ent)
}

# Drop non-expression columns before computing entropy
trans <- function(a) {
  a$target_id <- NULL
  a$type <- NULL
  a$sum <- NULL
  entrop(a)
}

# Filter out all-zero rows and attach an entropy column
type <- function(a) {
  col_name <- colnames(a)
  col_name <- col_name[! col_name %in% c("type", "target_id")]
  a$sum <- rowSums(a[, col_name], na.rm = TRUE)
  a <- filter(a, sum > 0)
  ent <- trans(a)
  a$entropy <- ent
  return(a)
}

# Overlaid entropy density plot for the three transcript classes
plot_entropy <- function(a, b, c) {
  p <- ggplot() +
    geom_density(data = a, aes(x = entropy, fill = "Non"), alpha = 0.4) +
    geom_density(data = b, aes(x = entropy, fill = "Cons"), alpha = 0.4) +
    geom_density(data = c, aes(x = entropy, fill = "mRNA"), alpha = 0.4) +
    xlab('entropy') + ylab('density') +
    guides(fill = guide_legend(title = "Types")) +
    theme(text = element_text(size = 20))
  ggsave("data/output/expression/entropy.png", p, width = 20, height = 20)
}

# log2-transform every expression column; records the new column names in the
# global variable new_col for use further below
log_transform <- function(a) {
  col_name <- colnames(a)
  col_name <- col_name[! col_name %in% c("type", "target_id")]
  col <- c()
  for (x in col_name) {
    col <- append(col, paste("log_", noquote(x), sep = ""))
    a[, paste("log_", noquote(x), sep = "")] <- log2(a[, noquote(x)])
  }
  new_col <<- col
  return(a)
}

# read input data frame
df <- read.table(args[1], header = TRUE)

# log2 of expression columns
df_log <- log_transform(df)

# expression analysis
df_log[df_log == -Inf] <- 0
my_comparisons = list(c("consv", "non_cons"), c("consv", "prot"), c("prot", "non_cons"))
ggboxplot(df_log, x = "type", y = new_col, merge = "flip",
          ylab = "log2(TPM)", color = "type", palette = "jco") +
  font("subtitle", size = 20) +
  font("caption", size = 20) +
  font("xlab", size = 20) +
  font("ylab", size = 20) +
  font("xy.text", size = 20) +
  theme(legend.text = element_text(size = 20))
ggsave("data/output/expression/box_plot.png", width = 20, height = 20)

ggdensity(df_log, x = new_col, y = "..density..", facet.by = "type",
          merge = TRUE, xlab = "TPM", rug = TRUE, add = "median",
          palette = "jco") +
  font("subtitle", size = 20) +
  font("caption", size = 20) +
  font("xlab", size = 20) +
  font("ylab", size = 20) +
  font("xy.text", size = 20) +
  theme(legend.text = element_text(size = 20))
ggsave("data/output/expression/hist_expr.png", width = 20, height = 20)

# entropy analysis
non <- df[grepl("noncons", df$type), ]
cons <- df[grepl("consv", df$type), ]
mRNA <- df[grepl("mRNA", df$type), ]

non_ent <- type(non)
cons_ent <- type(cons)
mRNA_ent <- type(mRNA)

plot_entropy(non_ent, cons_ent, mRNA_ent)
```
Metagenomic pipeline managed by snakemake (1.0)
```r
args <- commandArgs(trailingOnly = TRUE)

library(dplyr)
library(tidyr)
library(data.table)

# Expand rows whose key is a "; "-collapsed list into one row per key,
# putting the value column first
fixCollapsed <- function(df) {
  colnames(df) <- c("key", "value")
  df <- df %>%
    mutate(key = strsplit(key, "; ")) %>%
    unnest(key)
  df <- df[, c(2, 1)]
  return(df)
}

# Merge duplicated keys, concatenating their unique values with "; "
fixDuplicated <- function(df) {
  df <- df %>%
    group_by(key) %>%
    summarise(value = paste(value, collapse = "; "))
  values <- strsplit(df$value, "; ")
  values <- lapply(values, unique)
  values <- sapply(values, paste, collapse = "; ")
  df$value <- values
  return(df)
}

# Drop keys marked as unknown (starting with "-")
removeUnknown <- function(df) {
  idx <- grepl("^-", df$key)
  df <- df[!idx, ]
  return(df)
}

df <- fread(args[2], stringsAsFactors = FALSE, header = FALSE,
            nThread = as.integer(args[4]))
df <- as.data.frame(df)
df %>%
  fixCollapsed() %>%
  fixDuplicated() %>%
  removeUnknown() %>%
  fwrite(file = args[1], sep = "\t", nThread = as.integer(args[4]))

df <- fread(args[3], stringsAsFactors = FALSE, header = FALSE,
            nThread = as.integer(args[4]))
df <- as.data.frame(df)
df %>%
  fixCollapsed() %>%
  fixDuplicated() %>%
  removeUnknown() %>%
  fwrite(file = args[1], sep = "\t", append = TRUE, nThread = as.integer(args[4]))
```
Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)
```r
args <- commandArgs(trailingOnly = TRUE)
test.data <- args[1]
model <- args[2]
output <- args[3]

# load libraries
library(data.table)
library(plyr)
library(dplyr)
suppressMessages(library(mlr))

# load model
print("loading appropriate model")
load(model)

# load test data
print("loading and formatting test data")
test <- read.table(test.data, header = TRUE, sep = "\t",
                   na.strings = c("NA", ".", "na", "N/A"),
                   skipNul = FALSE, row.names = NULL)

# make predictions
print("making predictions and outputting results")
pred = predict(fit, newdata = test, type = "prob")
write.table(pred$data, file = output, sep = '\t', quote = FALSE,
            row.names = FALSE, na = ".")
```
```r
args <- commandArgs(trailingOnly = TRUE)
training <- args[1]
balance <- args[2]     # an integer (0 or 1) indicating whether to balance the data
model <- args[3]
importance <- args[4]  # importance for each of the variables is saved here
tune <- args[5]        # if specified, the results of cross validation are saved here

# load libraries
library(plyr)
library(dplyr)
suppressMessages(library(mlr))
library(parallelMap)
library(parallel)

# load data.frame
print("loading training data into R")
training <- read.table(training, header = TRUE, sep = "\t",
                       na.strings = c("NA", ".", "na", "N/A", "inf", "Inf", "infinity", "Infinity"),
                       skipNul = TRUE, row.names = NULL)
nrows_total <- nrow(training)
training <- na.omit(training)
if (nrows_total - nrow(training) > 0) {
  print(paste("ignoring", nrows_total - nrow(training), "rows that have NA values"))
}

print("creating training task and making RF learner")
# optimize hyperparameters
# make training task; the class label is assumed to be the last column
traintask <- makeClassifTask(data = training,
                             target = colnames(training)[ncol(training)],
                             positive = 1)
# create learner
rf.lrn <- makeLearner("classif.ranger", predict.type = "prob")

if (as.integer(balance)) {
  print("calculating class weights in order to ensure data is balanced when sampled")
  # first, retrieve the inverse of the counts of each of the labels
  w <- 1 / table(training[, ncol(training)])
  # calculate probabilities for each label
  w <- w / sum(w)
  # create a vector containing weights instead of labels
  weights <- rep(0, nrow(training))
  for (val in names(w)) {
    weights[training[, ncol(training)] == val] <- w[val]
  }
  # create par.vals
  rf.lrn$par.vals <- list(importance = 'impurity', verbose = TRUE, case.weights = weights)
} else {
  # create par.vals
  rf.lrn$par.vals <- list(importance = 'impurity', verbose = TRUE)
}

if (!is.na(tune)) {
  # mtry default: sqrt(number of features)
  # nodesize default: 1
  params <- makeParamSet(makeIntegerParam("mtry", lower = 1, upper = 10),
                         makeIntegerParam("min.node.size", lower = 7, upper = 25))
  # set validation strategy: 5-fold cross validation
  rdesc <- makeResampleDesc("CV", iters = 5L)
  # set optimization technique
  ctrl <- makeTuneControlGrid(resolution = c(mtry = 10, min.node.size = 19))
  # tune hyperparameters
  print("initiating multicore tuning of hyperparameters")
  # but run the hyperparameter tuning in parallel, since it'll take a while
  # number of cores should be detected automatically (but don't use
  # all of the cores because otherwise we'll use too much memory!)
  parallelStartSocket(cpus = trunc(detectCores() / 12), level = "mlr.tuneParams")
  parallelLibrary("mlr")

  # create a custom F-beta measure (beta = 0.5)
  fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
    properties = c("classif", "req.pred", "req.truth"),
    name = "Fbeta measure",
    note = "Defined as: (1+beta^2) * tp / (beta^2 * sum(truth == positive) + sum(response == positive))",
    fun = function(task, model, pred, feats, extra.args) {
      beta = 0.5
      beta = beta^2
      truth = pred$data$truth
      response = pred$data$response
      positive = pred$task.desc$positive
      (1 + beta) * measureTP(truth, response, positive) /
        (beta * sum(truth == positive) + sum(response == positive))
    }
  )

  tuned = tuneParams(learner = rf.lrn, task = traintask, resampling = rdesc,
                     measures = list(fbeta), par.set = params, control = ctrl,
                     show.info = TRUE)
  parallelStop()

  print("matrix of classifier performance for each pair of hyperparams")
  data = generateHyperParsEffectData(tuned)
  print(data$data)
  write.table(data$data, sep = "\t", file = tune, quote = FALSE, row.names = FALSE)

  print("tuned params are")
  print(tuned$x)
  rf.lrn$par.vals = c(rf.lrn$par.vals, tuned$x)
} else {
  # set default hyperparameter values
  rf.lrn$par.vals = c(rf.lrn$par.vals, mtry = 9, min.node.size = 16)
}

print("training model")
fit = mlr::train(rf.lrn, traintask)

# print out variable importance
print("recording variable importance:")
importance_df = as.data.frame(sort(fit$learner.model$variable.importance, decreasing = TRUE))
print(importance_df)
# the tab inside the column name yields a two-column "variable<TAB>importance"
# header when written out alongside the row names below
names(importance_df) <- c("variable\timportance")
write.table(importance_df, sep = "\t", file = importance, quote = FALSE)

# save the fitted model (saves the entire workspace image)
save.image(model)
```
Improving pathogenicity prediction of missense variants by using AlphaFold-derived features (0.9)
```r
library(readr)
library(dplyr)
library(optparse)
library(stringr)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-t", "--training_dataset"), type = "character",
              default = "results/prediction_final/pre_final_model_regular_variants.csv.gz",
              help = "File containing the training dataset"),
  make_option(c("-c", "--combined_model"), type = "character",
              default = "results/prediction_final/combined_model.RData",
              help = "File where the final combined logistic regression model should be saved to"),
  make_option(c("-i", "--training_var_ids"), type = "character",
              default = "results/prediction_final/training_var_ids.tsv",
              help = "IDs of variants that were in the training / test set")
)
opt = parse_args(OptionParser(option_list = option_list))

variants <- read_csv(opt$training_dataset, na = c(".", "NA", NA))
variants$AlphScore <- variants$predicted_Alph

index_col <- get_index_col(variants)
variants$DEOGEN2_score_med <- unlist_score(variants$DEOGEN2_score, index_col)

# Split into gnomAD variants and ClinVar variants not already in gnomAD
gnomad_vars <- variants %>%
  filter(gnomadSet == 1)
clinvar_vars_not_gnomad <- variants %>%
  filter(gnomadSet == 0) %>%
  filter(! var_id_genomic %in% gnomad_vars$var_id_genomic)

# Fit the combined logistic regression models on the ClinVar-only set
set_of_models <- fit_set_of_models(clinvar_vars_not_gnomad)

gnomad_set_position <- gnomad_vars$var_id_genomic
clinvar_set_position <- clinvar_vars_not_gnomad$var_id_genomic

training_vars <- rbind(
  tibble(var_id_genomic = gnomad_set_position, gnomadSet = 1),
  tibble(var_id_genomic = clinvar_set_position, gnomadSet = 0)
)

# write to disk
write_tsv(x = training_vars, file = opt$training_var_ids)
write_rds(x = set_of_models, file = opt$combined_model)
```
tool / cran
dplyr
A Grammar of Data Manipulation: A fast, consistent tool for working with data frame-like objects, both in memory and out of memory.
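For reference, a minimal sketch of the core dplyr verbs (filter, mutate, group_by, summarise, arrange) that recur throughout the snippets above; the depths data frame here is hypothetical:

```r
library(dplyr)

# Hypothetical per-sample sequencing depths, for illustration only
depths <- tibble(
  sample      = c("s1", "s2", "s3", "s4"),
  sample_type = c("normal", "tumor", "normal", "tumor"),
  mean_depth  = c(42.1, 87.5, 39.8, 91.2)
)

depths %>%
  filter(mean_depth > 40) %>%               # keep well-covered samples
  mutate(log_depth = log2(mean_depth)) %>%  # add a derived column
  group_by(sample_type) %>%                 # summarise per sample type
  summarise(avg_depth = mean(mean_depth)) %>%
  arrange(desc(avg_depth))                  # highest average depth first
```

Each verb takes a data frame as its first argument and returns a new data frame, which is why the pipe (`%>%`) chains seen in the workflows above compose so naturally.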