Workflow Steps and Code Snippets

530 tagged steps and code snippets that match the keyword dplyr

WES analysis pipeline

# Point R at the conda environment's timezone database
Sys.setenv("TZDIR"=paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(readr)
library(dplyr)
library(ggplot2)

# Read per-sample mean depths and fix the plotting order of the sample types
df <- read_csv(snakemake@input[[1]])
df <- df %>%
    mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))
# Boxplot of average depth per sample type
p <- df %>%
    ggplot(aes(sample_type, mean)) +
    geom_boxplot(aes(fill = sample_type)) +
    labs(x = "", y = "Average Depth") +
    guides(fill = guide_legend(title="")) +
    ggtitle("Sequencing Depths")
ggsave(snakemake@output[[1]], p)
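
For reference, a minimal sketch of the input this step expects; the sample_type and mean columns are taken from the script above, the values are invented:

library(dplyr)
library(ggplot2)

# Hypothetical stand-in for snakemake@input[[1]]: per-sample mean coverage
df <- tibble::tibble(
    sample_type = c("normal", "normal", "tumor", "tumor"),
    mean        = c(80, 95, 120, 110)
)
# Fixing the factor levels pins the left-to-right order of the boxplots
df <- df %>%
    mutate(sample_type = factor(sample_type, levels = c("normal", "tumor")))
levels(df$sample_type)
# [1] "normal" "tumor"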

Processing sorted cells from BC1 profiled by bulk RNA-seq

# Redirect console output and messages to the Snakemake log file
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(DESeq2)
library(IHW)
library(ggplot2)
library(dplyr)
library(stringr)

if (snakemake@threads > 1) {
    library("BiocParallel")
    parallel <- TRUE
    register(MulticoreParam(snakemake@threads))
} else {
    parallel <- FALSE
}

# Transcript-to-gene map; keep distinct gene-ID/symbol pairs for annotation
t2g_fp <- snakemake@input[[2]]
t2g <- readr::read_table2(t2g_fp, col_names = c("tx", "ensgene", "symbol"))
t2g <- t2g %>%
    dplyr::distinct(ensgene, symbol)

dds <- readRDS(snakemake@input[[1]])

print(dds)

# Grab last variable at end of formula for contrasts
design_formula <- snakemake@params[['formula']]
s <- str_remove_all(design_formula, " |~")
s <- str_split(s, "\\+")
vars <- s[[1]]
var <- vars[length(vars)]

print(snakemake@params[['contrast']])
print("Creating results for the following variable:")
print(var)

print(resultsNames(dds))

contrast_coef <- paste(c(var, snakemake@params[['contrast']][1], "vs", snakemake@params[['contrast']][2]), collapse="_")
de_contrast <- c(var, snakemake@params[['contrast']][1], snakemake@params[['contrast']][2])

mle_res <- results(dds, contrast=de_contrast, filterFun=ihw, alpha = .05, parallel = parallel)
map_res <- lfcShrink(dds, coef=contrast_coef, type = "apeglm", parallel = parallel)

print("MLE LFC:")
print(mle_res)
print(summary(mle_res))

print("MAP LFC:")
print(map_res)
print(summary(map_res))

# Annotate results with gene symbols and keep one row per symbol
mle_df <- mle_res %>%
    data.frame() %>%
    tibble::rownames_to_column(var = "ensgene") %>%
    as_tibble() %>%
    left_join(t2g, by = "ensgene") %>%
    dplyr::select(symbol, ensgene, everything()) %>%
    dplyr::arrange(padj) %>%
    dplyr::distinct(symbol, .keep_all = TRUE)

map_df <- map_res %>%
    data.frame() %>%
    tibble::rownames_to_column(var = "ensgene") %>%
    as_tibble() %>%
    left_join(t2g, by = "ensgene") %>%
    dplyr::select(symbol, ensgene, everything()) %>%
    dplyr::arrange(padj) %>%
    dplyr::distinct(symbol, .keep_all = TRUE)

print("MLE dataframe")
print(mle_df)
print("MAP dataframe")
print(map_df)


mleplot <- mle_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean),log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MLE Log2FoldChange") +
    theme_bw()

mapplot <- map_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean),log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MAP Log2FoldChange") +
    theme_bw()

readr::write_csv(mle_df, snakemake@output[['mleres']])
readr::write_csv(map_df, snakemake@output[['mapres']])
ggsave(snakemake@output[['mlema']], plot = mleplot, width = 10, height = 7)
ggsave(snakemake@output[['mapma']], plot = mapplot, width = 10, height = 7)
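
For intuition, the coefficient name assembled above must match one of resultsNames(dds). A small sketch with an assumed formula and contrast (not taken from this workflow's config):

library(stringr)

design_formula <- "~ batch + condition"   # assumed example formula
contrast <- c("treated", "untreated")     # assumed example contrast

s <- str_remove_all(design_formula, " |~")  # "batch+condition"
vars <- str_split(s, "\\+")[[1]]            # "batch" "condition"
var <- vars[length(vars)]                   # "condition" (last term is tested)

paste(c(var, contrast[1], "vs", contrast[2]), collapse = "_")
# [1] "condition_treated_vs_untreated"  -- the coefficient apeglm shrinks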

Repository to process ITS amplicon sequencing data and produce figures for Schneider et al. (2021)

itsx_in <- snakemake@input$its_in
itsx_out <- snakemake@input$its_out
sq.nc <- snakemake@input$seqtab
out <- snakemake@output$seqtab
out_mock <- snakemake@output$mock
threads <- snakemake@threads
mock <- snakemake@params$mock

mock_samples <- strsplit(mock, ",")[[1]]

library(dada2); packageVersion("dada2")
library(dplyr)
library(Biostrings)

dna <- readDNAStringSet(itsx_in)
dna_itsx <- as.character(readDNAStringSet(itsx_out))
seqtab.nc <- readRDS(sq.nc)
colnames(seqtab.nc) <- names(dna)
seqtab.nc <- seqtab.nc[,colnames(seqtab.nc)%in%names(dna_itsx)]

colnames(seqtab.nc) <- dna_itsx
seqtab.nc <- t(seqtab.nc)

# Remove sequences shorter than 50 bp
if (any(nchar(getSequences(t(seqtab.nc))) < 50)) {
  dna_itsx <- dna_itsx[!(nchar(dna_itsx) < 50)]
  seqtab.nc <- seqtab.nc[nchar(rownames(seqtab.nc)) >= 50,]
}


# At this point we can collapse all identical sequences
seqtab.nc2 <- cbind.data.frame(sequence=rownames(seqtab.nc), seqtab.nc)
seqtab.nc3 <- group_by(seqtab.nc2, sequence) %>%
  summarise(across(everything(), sum))

if (length(intersect(colnames(seqtab.nc3), mock_samples)) > 0) {
  mock_cols <- append(c("sequence"), intersect(colnames(seqtab.nc3), mock_samples))
  mock <- seqtab.nc3 %>% select(all_of(mock_cols))
  seqtab.nc4 <- seqtab.nc3 %>% select(!any_of(mock_cols))
  saveRDS(mock, out_mock)
} else {
  seqtab.nc4 <- seqtab.nc3[,colnames(seqtab.nc3)!="sequence"]
}

seqtab.nc4 <- as.data.frame(seqtab.nc4)
rownames(seqtab.nc4) <- seqtab.nc3$sequence
seqtab.nc4 <- data.matrix(t(seqtab.nc4))

saveRDS(seqtab.nc4, out)
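
For intuition, here is the identical-sequence collapsing step on a toy count table (made-up sequences and counts):

library(dplyr)

# Hypothetical count table: rows are sequences, columns are samples
seqtab <- data.frame(sequence = c("ACGT", "ACGT", "TTGA"),
                     s1 = c(5, 2, 1),
                     s2 = c(0, 3, 4))

# Identical sequences (e.g. after ITSx trimming) are summed per sample
group_by(seqtab, sequence) %>%
  summarise(across(everything(), sum))
# sequence    s1    s2
# ACGT         7     3
# TTGA         1     4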

Workflow for transcript expression analysis.

library(gage)
library(gageData)
library(pathview)
library(dplyr)

#p_all <- read.table("../sleuth/p-values_all_transcripts.csv", header=TRUE)
#matr <- read.table("../sleuth/sleuth_matrix.csv", header=TRUE)
#samples <- read.table("../table_for_reads.csv", header=TRUE, stringsAsFactors = TRUE)

p_all <- read.table(snakemake@input[["pval"]], header=TRUE)
matr <- read.table(snakemake@input[["matrix"]], header=TRUE)
samples <- read.table(snakemake@input[["samples"]], header=TRUE, stringsAsFactors = TRUE)

# p_all is sorted by p-value; reorder it by gene ID to match the matrix:
p_all <- dplyr::arrange(p_all, target_id)

if(any(p_all$target_id != row.names(matr))){
  stop("The count matrix and the results of the sleuth significance analysis do not match!")
}
# Replace target_id with gene names
rownames(matr) = make.names(p_all$ext_gene, unique = TRUE)

# Sample names belonging to each of the two conditions
condition_1 <- samples$sample[samples$condition == levels(factor(samples$condition))[1]]
condition_2 <- samples$sample[samples$condition == levels(factor(samples$condition))[2]]

samples.cond_1 <- matr[, as.character(condition_1)]
samples.cond_2 <- matr[, as.character(condition_2)]

#log2-FoldChange   
FoldChange <- rowSums(samples.cond_2)/rowSums(samples.cond_1)
log2FC <- log2(FoldChange)

p_val <- p_all$pval
q_val <- p_all$qval

# Build the data frame for gage with:
# gene/target ID,
# fold change (log2),
# p-values from the sleuth analysis, and
# the multiple-testing-corrected p-values from the sleuth analysis
# (qval, i.e. correction for alpha-error accumulation in multiple testing)

gage.data <- data.frame(TargetID = p_all$target_id, EnsemblID = p_all$ens_gene,
                        GeneID = p_all$ext_gene, log2FoldChange = log2FC,
                        pVal = p_val, PostHoc_pValues = q_val, Mean = p_all$mean_obs)
gage.data <- na.omit(gage.data)

detach("package:dplyr", unload=TRUE)

data(kegg.sets.hs)
data(sigmet.idx.hs)
kegg.sets.hs <- kegg.sets.hs[sigmet.idx.hs]

# Get the results
keggres <- gage(gage.data$log2FoldChange, gsets=kegg.sets.hs, same.dir=TRUE)
# Look at both up (greater), down (less), and statistics.
lapply(keggres, head)

# Get the pathways
keggrespathways <- data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  as_tibble() %>%
  filter(row_number() <= 10) %>%
  pull(id) %>%
  as.character()
keggrespathways

# Get the IDs.
keggresids <- substr(keggrespathways, start=1, stop=8)
keggresids

# Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=gage.data$log2FoldChange, pathway.id=pid,
                                      species="hsa", new.signature=FALSE)

# Plot multiple pathways (plots are saved to disk; the return value is a throwaway list)
tmp = sapply(keggresids, plot_pathway)
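
The pathway IDs extracted above are simply the first eight characters of each gage result name, e.g.:

# gage row names look like "hsa04110 Cell cycle"; the ID is the first 8 characters
keggrespathways <- c("hsa04110 Cell cycle", "hsa03030 DNA replication")
substr(keggrespathways, start = 1, stop = 8)
# [1] "hsa04110" "hsa03030"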

Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences

library(ggpubr)
library(dplyr)
library(purrr)
library(BioQC)
library(ggplot2)
library(data.table)

args <- commandArgs(TRUE)

# Normalised tissue-specificity entropy for each row of an expression matrix
entrop <- function(a) {
  ent <- entropySpecificity(a, norm=TRUE)
  return(ent)
}

# Drop annotation columns, then compute entropy on the expression values
trans <- function(a) {
  a$target_id <- NULL
  a$type <- NULL
  a$sum <- NULL
  entrop(a)
}

# Keep rows with non-zero total expression and attach their entropy
type <- function(a) {
  col_name <- colnames(a)
  col_name <- col_name[! col_name %in% c("type", "target_id")]
  a$sum <- rowSums(a[,col_name], na.rm=TRUE)
  a <- filter(a, sum > 0)
  ent <- trans(a)
  a$entropy <- ent
  return(a)
}

# Overlay entropy densities for the three transcript classes and save the figure
# (renamed from plot() to avoid shadowing base::plot)
plot_entropy <- function(a, b, c) {
  p <- ggplot() +
    geom_density(data = a, aes(x = entropy, fill="Non"), alpha=0.4)+
    geom_density(data = b, aes(x = entropy, fill="Cons"), alpha=0.4)+
    geom_density(data = c, aes(x = entropy, fill="mRNA"), alpha=0.4)+
    scale_fill_discrete(breaks = c("Non", "Cons", "mRNA")) +
    xlab('entropy') + ylab('density') + guides(fill = guide_legend(title = "Types")) +
    theme(text = element_text(size = 20))
  ggsave("data/output/expression/entropy.png", plot = p, width = 20, height = 20)
}

# Add log2-transformed copies of the expression columns; the new column names
# are stored in the global variable new_col for the plots below
# (renamed from log() to avoid shadowing base::log)
log_transform <- function(a) {
       col_name <- colnames(a)
       col_name <- col_name[! col_name %in% c("type", "target_id")]
       col <- c()
       for (x in col_name) {
            col <- append(col, paste("log_", x, sep = ""))
            a[, paste("log_", x, sep = "")] <- log2(a[, x])
       }
       new_col <<- col
       return(a)
}

# read the input data frame
df <- read.table(args[1], header = TRUE)

# log2-transform the expression columns
df_log <- log_transform(df)

# expression analysis
df_log[df_log == -Inf] <- 0
my_comparisons = list(c("consv", "non_cons"), c("consv", "prot"), c("prot", "non_cons"))

ggboxplot(df_log, x="type", y= new_col , merge = "flip", ylab = "log2(TPM)", color = "type", palette = "jco")+  
  font("subtitle", size = 20)+
  font("caption", size = 20)+
  font("xlab", size = 20)+
  font("ylab", size = 20)+
  font("xy.text", size = 20) +theme(legend.text=element_text(size=20))
ggsave("data/output/expression/box_plot.png", width = 20, height = 20)

ggdensity(df_log, x = new_col , y = "..density..", facet.by = "type", merge = TRUE, xlab = "TPM", rug = TRUE, add = "median", palette = "jco") +  font("subtitle", size = 20)+
  font("caption", size = 20)+
  font("xlab", size = 20)+
  font("ylab", size = 20)+
  font("xy.text", size = 20) + theme(legend.text=element_text(size=20))
ggsave("data/output/expression/hist_expr.png", width = 20, height = 20)

# entropy analysis
non <- df[grepl("noncons", df$type),]
cons <- df[grepl("consv", df$type),]
mRNA <- df[grepl("mRNA", df$type),]

non_ent <- type(non)
cons_ent <- type(cons)
mRNA_ent <- type(mRNA)

plot_entropy(non_ent, cons_ent, mRNA_ent)
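
For intuition, BioQC's entropySpecificity scores each row of an expression matrix; a minimal sketch on made-up values (not from the workflow's data):

library(BioQC)

# Two hypothetical genes across three samples
expr <- rbind(ubiquitous = c(10, 10, 10),   # evenly expressed
              specific   = c(0, 0, 30))     # expressed in one sample only
# With norm=TRUE the score is scaled to [0, 1]: the uniform row scores 0,
# the single-sample row scores 1
entropySpecificity(expr, norm = TRUE)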

Metagenomic pipeline managed by snakemake (1.0)

args <- commandArgs(trailingOnly = TRUE)

library(dplyr)
library(tidyr)
library(data.table)
# Split rows whose key field is a "; "-collapsed list into one row per key
fixCollapsed <- function(df){
    colnames(df) <- c("key", "value")
    df <- df %>%
        mutate(key = strsplit(key, "; ")) %>%
        unnest(key)
    df <- df[, c(2, 1)]
    return(df)
}
# Merge rows sharing a key, keeping each value only once
fixDuplicated <- function(df){
    df <- df %>%
        group_by(key) %>%
        summarise(value = paste(value, collapse = "; "))
    values <- strsplit(df$value, "; ")
    values <- lapply(values, unique)
    values <- sapply(values, paste, collapse = "; ")
    df$value <- values
    return(df)
}
# Drop keys beginning with "-" (unannotated entries)
removeUnknown <- function(df){
    idx <- grepl("^-", df$key)
    df <- df[!idx,]
    return(df)
}
# Clean one annotation table and write it to the shared output file
processFile <- function(infile, outfile, threads, append = FALSE){
    df <- fread(infile, stringsAsFactors = FALSE,
        header = FALSE, nThread = threads)
    df <- as.data.frame(df)
    df %>%
        fixCollapsed() %>%
        fixDuplicated() %>%
        removeUnknown() %>%
        fwrite(file = outfile, sep = "\t",
            append = append, nThread = threads)
}
processFile(args[2], args[1], as.integer(args[4]))
processFile(args[3], args[1], as.integer(args[4]), append = TRUE)
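
A toy run of the two clean-up helpers defined above (made-up key/value rows):

library(dplyr)
library(tidyr)

# Hypothetical annotation rows: one collapsed key, one duplicate key
df <- data.frame(key   = c("K00001; K00002", "K00001"),
                 value = c("geneA", "geneB"))
df %>% fixCollapsed() %>% fixDuplicated()
# key      value
# K00001   geneA; geneB
# K00002   geneA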

Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)

args <- commandArgs(trailingOnly = TRUE)
test.data<- args[1]
model <- args[2]
output<- args[3]

# load libraries
library(data.table)
library(plyr)
library(dplyr)
suppressMessages(library(mlr))

# load model
print("loading appropriate model")
load(model)

# load test
print("loading and formatting test data")
test<- read.table(test.data, header=TRUE, sep="\t", na.strings=c("NA",".","na","N/A"), skipNul=FALSE, row.names=NULL)

# making predictions
print("making predictions and outputting results")
pred = predict(fit, newdata = test)   # probabilities come from the saved learner's predict.type
write.table(pred$data, file = output, sep = '\t', quote = FALSE, row.names = FALSE, na = ".")
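
For context, load(model) above is expected to restore an mlr model object named fit (the companion training script below saves it with save.image). A minimal, self-contained sketch of that round trip; iris is purely illustrative:

library(mlr)

task <- makeClassifTask(data = iris, target = "Species")      # toy task
lrn  <- makeLearner("classif.ranger", predict.type = "prob")  # needs the ranger package
fit  <- mlr::train(lrn, task)
save(fit, file = "model.RData")                               # counterpart of load(model)
pred <- predict(fit, newdata = iris)
head(pred$data)   # per-class probabilities plus the predicted response
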
args <- commandArgs(trailingOnly = TRUE)
training<- args[1]
balance<- args[2] # an integer (0 or 1) indicating whether to balance the data
model<- args[3]
importance<- args[4] # importance for each of the variables is saved here
tune<- args[5] # if specified, the results of cross validation are saved here

# load libraries
library(plyr)
library(dplyr)
suppressMessages(library(mlr))
library(parallelMap)
library(parallel)

# load data.frame
print("loading training data into R")
training<- read.table(training, header=TRUE, sep="\t", na.strings=c("NA",".","na","N/A","inf","Inf","infinity","Infinity"), skipNul=T, row.names=NULL)
nrows_total <- nrow(training)
training <- na.omit(training)
if (nrows_total-nrow(training)>0) {
	print(paste("ignoring", nrows_total-nrow(training), "rows that have NA values"))
}

print("creating training task and making RF learner")

# optimize hyper parameters
# make training task
traintask <- makeClassifTask(data = training, target = colnames(training)[ncol(training)], positive = 1)

# create learner
rf.lrn <- makeLearner("classif.ranger", predict.type = "prob")

if (as.integer(balance)) {
	print("calculating class weights in order to ensure data is balanced when sampled")
	# first, retrieve the inverse of the counts of each of the labels
	w <- 1/table(training[,ncol(training)])
	# calculate probabilities for each label
	w <- w/sum(w)
	# create a vector containing weights instead of labels
	weights <- rep(0, nrow(training))
	for (val in names(w)) {
		weights[training[,ncol(training)] == val] <- w[val]
	}

	# create par.vals
	rf.lrn$par.vals <- list(importance='impurity', verbose=TRUE, case.weights=weights)
} else {
	# create par.vals
	rf.lrn$par.vals <- list(importance='impurity', verbose=TRUE)
}

if (!is.na(tune)) {
	# mtry default: sqrt(number of features)
	# nodesize default: 1
	params <- makeParamSet(makeIntegerParam("mtry",lower = 1,upper = 10),
                           makeIntegerParam("min.node.size",lower = 7,upper = 25))
	# set validation strategy; 5-fold cross validation
	rdesc <- makeResampleDesc("CV",iters=5L)
	# set optimization technique
	ctrl <- makeTuneControlGrid(resolution=c(mtry=10, min.node.size=19))

	# tune hyperparameters
	print("initiating multicore tuning of hyperparameters")
	# but run the hyperparameter tuning in parallel, since it'll take a while
	# number of cores should be detected automatically (but don't use
	# all of the cores because otherwise we'll use too much memory!)
	parallelStartSocket(cpus=max(1, trunc(detectCores()/12)), level="mlr.tuneParams")
	parallelLibrary("mlr")

	# create a custom F beta measure
	fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
		properties = c("classif", "req.pred", "req.truth"),
		name = "Fbeta measure",
		note = "Defined as: (1+beta^2) * tp/ (beta^2 * sum(truth == positive) + sum(response == positive))",
		fun = function(task, model, pred, feats, extra.args) {
			beta = 0.5
			beta = beta^2
			truth = pred$data$truth
			response = pred$data$response
			positive = pred$task.desc$positive
			(1+beta) * measureTP(truth, response, positive) /
				(beta * sum(truth == positive) + sum(response == positive))
		}
	)

	tuned = tuneParams(learner=rf.lrn, task=traintask, resampling=rdesc, measures=list(fbeta), par.set=params, control=ctrl, show.info=T)

	parallelStop()

	print("matrix of classifier performance for each pair of hyperparams")
	data = generateHyperParsEffectData(tuned)
	print(data$data)
	write.table(data$data, sep="\t", file=tune, quote=FALSE, row.names=F)
	print("tuned params are")
	print(tuned$x)
	rf.lrn$par.vals = c(rf.lrn$par.vals, tuned$x)
} else {
	# set default hyperparameter values
	rf.lrn$par.vals = c(rf.lrn$par.vals, mtry=9, min.node.size=16)
}
print("training model")
fit = mlr::train(rf.lrn, traintask)

# print out variable importance
print("recording variable importance:")
importance_df = as.data.frame(sort(fit$learner.model$variable.importance, decreasing=TRUE))
print(importance_df)
# header trick: the row names hold the variable names, so both column labels
# are embedded in a single tab-separated header cell
names(importance_df) <- c("variable\timportance")
write.table(importance_df, sep="\t", file=importance, quote=FALSE)

# save.data
save.image( model )
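
The class-weighting block in isolation, on a made-up label vector, to show what case.weights receives:

# Hypothetical labels: three negatives, one positive
labels <- c(0, 0, 0, 1)
w <- 1/table(labels)    # inverse class counts
w <- w/sum(w)           # normalise to probabilities: 0 -> 0.25, 1 -> 0.75
weights <- rep(0, length(labels))
for (val in names(w)) {
    weights[labels == val] <- w[val]
}
weights                 # 0.25 0.25 0.25 0.75; the rare class is sampled more often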

Improving pathogenicity prediction of missense variants by using AlphaFold-derived features (0.9)

library(readr)
library(dplyr)
library(optparse)
library(stringr)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-t", "--training_dataset"), type="character", default="results/prediction_final/pre_final_model_regular_variants.csv.gz", 
              help="File containing the training dataset"),
  make_option(c("-c", "--combined_model"), type="character", default="results/prediction_final/combined_model.RData", 
              help="File where the final combined logistic regression model should be saved to"),
  make_option(c("-i", "--training_var_ids"), type="character", default="results/prediction_final/training_var_ids.tsv", 
              help="IDs of variants that were in the training / test set")
)

opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$training_dataset, na=c(".","NA", NA))
variants$AlphScore<-variants$predicted_Alph

index_col<-get_index_col(variants)

variants$DEOGEN2_score_med<- unlist_score(variants$DEOGEN2_score, index_col)

# gnomAD-derived variants form one stratum of the training data
gnomad_vars <- variants %>%
  filter(gnomadSet == 1)

# ClinVar variants, excluding any that also occur in the gnomAD set
clinvar_vars_not_gnomad <- variants %>%
  filter(gnomadSet == 0) %>%
  filter(! var_id_genomic %in% gnomad_vars$var_id_genomic)

set_of_models<-fit_set_of_models(clinvar_vars_not_gnomad)

gnomad_set_position<-gnomad_vars$var_id_genomic
clinvar_set_position<-clinvar_vars_not_gnomad$var_id_genomic

training_vars<-rbind(
tibble(var_id_genomic=gnomad_set_position,
       gnomadSet=1),
tibble(var_id_genomic=clinvar_set_position,
       gnomadSet=0)
)
# write to disk
write_tsv(x=training_vars,
          file=opt$training_var_ids)

write_rds(x=set_of_models,
          file=opt$combined_model)
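
The de-overlap step above in miniature, with hypothetical variant IDs:

library(dplyr)

# Hypothetical variant IDs; one appears in both strata
variants <- tibble::tibble(
  var_id_genomic = c("1-100-A-T", "1-100-A-T", "2-200-G-C"),
  gnomadSet      = c(1, 0, 0)
)
gnomad_vars <- variants %>% filter(gnomadSet == 1)

# The ClinVar stratum keeps only IDs absent from the gnomAD stratum
variants %>%
  filter(gnomadSet == 0) %>%
  filter(! var_id_genomic %in% gnomad_vars$var_id_genomic)
# -> only the 2-200-G-C row remains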

tool / cran

dplyr

A Grammar of Data Manipulation: A fast, consistent tool for working with data frame like objects, both in memory and out of memory.