Workflow Steps and Code Snippets

286 tagged steps and code snippets that match keyword data.table

QTL Snakemake Workflow

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library(yaml)
library(data.table)
library(devtools)

install_github("bmansfeld/QTLseqr",upgrade_dependencies=  TRUE)
library("QTLseqr")
library("ggplot2")

config <- yaml.load_file("config.yaml")
QTL_Plotting <- function(HighBulk,LowBulk, No_of_Chromosomes){
  options(warn=-1)
  table_file <- 'QTL_VCF_to_Table/QTL_Table.csv'
  chrom_names<-as.list(unique(fread(table_file,sep = "\t", select = c("CHROM") ))) 
  #chrom_vcf_names<- as.character.Array(droplevels(scan_vcf$`*:*-*`$rowRanges@seqnames@values[1:No_of_Chromosomes]))
  df <- importFromTable(file = table_file,
                        highBulk = HighBulk,
                        lowBulk = LowBulk,
                        chromList = chrom_names$CHROM[1:No_of_Chromosomes],sep = '\t')

  ggplot(data = df)+geom_histogram(aes(x = DP.HIGH + DP.LOW))+xlim(0,1000)
  ggsave(filename="QTL_Plots/DP_Filtering Data",device= "pdf", width=20, height=10)
  ggplot(data = df)+geom_histogram(aes(x = REF_FRQ))
  ggsave(filename="QTL_Plots/REF Frequency for Filtering Data",device = "pdf", width=20, height=10)
  ggplot(data = df)+geom_histogram(aes(x = SNPindex.HIGH))
  ggsave(filename = "QTL_Plots/SNP Index for Filtering Data",device="pdf", width=20, height=10)


  df_filt <-
    filterSNPs(
      SNPset = df,
      refAlleleFreq = config$QTL_Parser$REF_Allel_Frequency,
      minTotalDepth = config$QTL_Parser$Min_Total_Depth,
      maxTotalDepth = config$QTL_Parser$Max_Total_Depth,
      depthDifference = config$QTL_Parser$Depth_Difference,
      minSampleDepth = config$QTL_Parser$Min_Sample_Depth,
      minGQ = config$QTL_Parser$Min_GQ,
      verbose = TRUE
    )

  df_filt <- runQTLseqAnalysis(df_filt,
                               windowSize = config$QTL_Parser$WindowSize,
                               popStruc = "F2",
                               bulkSize = c(config$QTL_Parser$High_Bulk_Size, config$QTL_Parser$Low_Bulk_Size),
                               replications = 10000,
                               intervals = c(95, 99))

  df_filt <- runGprimeAnalysis(df_filt,windowSize = config$QTL_Parser$WindowSize,outlierFilter = "deltaSNP",filterThreshold = config$QTL_Parser$Filter_Threshold)

  plotGprimeDist(SNPset = df_filt, outlierFilter = "Hampel")
  ggsave(filename = "QTL_Plots/GPrime Distribution with Hampel Outlier Filter",device = "pdf", width=20, height=10)

  plotGprimeDist(SNPset =df_filt, outlierFilter = "deltaSNP", filterThreshold = config$QTL_Parser$Filter_Threshold)
  ggsave(filename = "QTL_Plots/GPrime Distribution with deltaSNP Outlier Filter",device = "pdf", width=20, height=10)

  p1 <- plotQTLStats(SNPset = df_filt, var = "nSNPs")
  ggsave(filename = "QTL_Plots/SNP Density Plot",plot=p1,device = "pdf", width=20, height=10)

  p2 <- plotQTLStats(SNPset = df_filt, var = "deltaSNP", plotIntervals = TRUE)
  ggsave(filename = "QTL_Plots/Delta SNP Index Plot with Intervals",plot=p2,device = "pdf", width=20, height=10)
  p3 <- plotQTLStats(SNPset = df_filt, var = "Gprime", plotThreshold = TRUE, q = config$QTL_Parser$FDR_q)
  ggsave(filename = "QTL_Plots/GPrime Value Plot",plot=p3,device = "pdf", width=20, height=10)

}

QTL_Plotting(config$QTL_Parser$HighBulk,config$QTL_Parser$LowBulk,config$QTL_Parser$Number_of_Chromosomes)

Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
library(ggpubr)
library(dplyr)
library(purrr)
library(BioQC)
library(ggplot2)
library(data.table)

args <- commandArgs(TRUE)

entrop <- function(a) {
  ent <- entropySpecificity(a, norm=TRUE)
  return(ent)
}

trans <- function(a) {
  a$target_id <- NULL
  a$type <- NULL
  a$sum <- NULL
  entrop(a)
}

type <- function(a) {
  col_name <- colnames(a)
  col_name <- col_name[! col_name %in% c("type", "target_id")]
  a$sum <-  rowSums(a[,col_name], na.rm=TRUE)
  a <- filter(a, sum >0)
  ent <- trans(a)
  a$entropy <- ent
  return(a)
}

plot <- function(a,b,c) {
  ggplot() +
    geom_density(data = a, aes(x = entropy, fill="Non"), alpha=0.4)+
    geom_density(data = b, aes(x = entropy, fill="Cons"), alpha=0.4)+
    geom_density(data = c, aes(x = entropy, fill="mRNA"), alpha=0.4)+
    scale_colour_manual("", 
                        breaks = c("Non", "Cons", "mRNA")) +
    xlab('entropy') + ylab('density') + guides(fill = guide_legend(title = "Types")) + 
    theme(text = element_text(size = 20))
  ggsave("data/output/expression/entropy.png", width = 20, height = 20)
}

log <- function(a) {
       col_name <- colnames(a)
       col_name <- col_name[! col_name %in% c("type", "target_id")]  
       col <- c()
       for (x in col_name) {
            col <- append(col, paste("log_", noquote(x), sep = ""))
            a[, paste("log_", noquote(x), sep = "")] <- log2(a[, noquote(x)])
       } 
       new_col <<- col
       return(a)
}

# install dataframe
df <- read.table(args[1], header = TRUE)

# log of columns
df_log <- log(df) 

# exprassin analysis
df_log[df_log == -Inf] <- 0
my_comparisons = list(c("consv", "non_cons"), c("consv", "prot"),  c("prot", "non_cons"))

ggboxplot(df_log, x="type", y= new_col , merge = "flip", ylab = "log2(TPM)", color = "type", palette = "jco")+  
  font("subtitle", size = 20)+
  font("caption", size = 20)+
  font("xlab", size = 20)+
  font("ylab", size = 20)+
  font("xy.text", size = 20) +theme(legend.text=element_text(size=20))
ggsave("data/output/expression/box_plot.png", width = 20, height = 20)

ggdensity(df_log, x = new_col , y = "..density..", facet.by = "type", merge = TRUE, xlab = "TPM", rug = TRUE, add = "median", palette = "jco") +  font("subtitle", size = 20)+
  font("caption", size = 20)+
  font("xlab", size = 20)+
  font("ylab", size = 20)+
  font("xy.text", size = 20) + theme(legend.text=element_text(size=20))
ggsave("data/output/expression/hist_expr.png", width = 20, height = 20)

# entropy analysis
non <-df[grepl("noncons", df$type),]
cons <-df[grepl("consv", df$type),]
mRNA <-df[grepl("mRNA", df$type),]

non_ent <- type(non)
cons_ent <- type(cons)
mRNA_ent <- type(mRNA)

plot(non_ent, cons_ent, mRNA_ent)

Metagenomic pipeline managed by snakemake (1.0)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
args <- commandArgs(trailingOnly = TRUE)

library(dplyr)
library(tidyr)
library(data.table)
fixCollapsed <- function(df){
    colnames(df) <- c("key", "value")
    df <- df %>%
        mutate(key = strsplit(key, "; ")) %>%
        unnest(key)
    df <- df[, c(2, 1)]
    return(df)
}
fixDuplicated <- function(df){
    df <- df %>%
        group_by(key) %>%
        summarise(value = paste(value, collapse = "; "))
    values <- strsplit(df$value, "; ")
    values <- lapply(values, unique)
    values <- sapply(values, paste, collapse = "; ")
    df$value <- values
    return(df)
}
removeUnknown <- function(df){
    idx <- grepl("^-", df$key)
    df <- df[!idx,]
    return(df)
}
df <- fread(args[2], stringsAsFactors = FALSE,
    head = FALSE, nThread = as.integer(args[4]))
df <- as.data.frame(df)
df %>%
    fixCollapsed() %>%
    fixDuplicated() %>%
    removeUnknown() %>%
    fwrite(file = args[1], sep = "\t",
        nThread = as.integer(args[4]))
df <- fread(args[3], stringsAsFactors = FALSE,
    head = FALSE, nThread = as.integer(args[4]))
df <- as.data.frame(df)
df %>%
    fixCollapsed() %>%
    fixDuplicated() %>%
    removeUnknown() %>%
    fwrite(file = args[1], sep = "\t",
        append = TRUE, nThread = as.integer(args[4]))

Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
args <- commandArgs(trailingOnly = TRUE)
test.data<- args[1]
model <- args[2]
output<- args[3]

# load libraries
library(data.table)
library(plyr)
library(dplyr)
suppressMessages(library(mlr))

# load model
print("loading appropriate model")
load(model)

# load test
print("loading and formatting test data")
test<- read.table(test.data, header=TRUE, sep="\t", na.strings=c("NA",".","na","N/A"), skipNul=FALSE, row.names=NULL)

# making predictions
print("making predictions and outputting results")
pred= predict(fit, newdata= test, type="prob")
write.table(pred$data, sep='\t', quote=FALSE, row.names=FALSE, na=".", output)

Improving pathogenicity prediction of missense variants by using AlphaFold-derived features (0.9)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
library(tidyverse)
library(readxl)
library(data.table)
library(optparse)

option_list = list(
  make_option(c("-m", "--mapping"), type="character", default="resources/mapping_genename_uniprot.tsv", 
              help="location of file that maps gene name in assay to UniProt ID"),
  make_option(c("-p", "--prot_folder"), type="character", default="results/predicted_prots_eval", 
              help="Folder with protein files with AlphScore predictions"),
  make_option(c("-a", "--suffix"), type="character", default="_w_AlphScore_red_FALSE.csv.gz", 
              help="suffix that protein files with AlphScore predictions have")
)
opt = parse_args(OptionParser(option_list=option_list))

list_uniprot_ids<-read_tsv(opt$mapping)
PREDICTED_PROT_SUFFIX=opt$suffix
folder_of_prots=opt$prot_folder

# MAPK1	/ Uniprot P28482 not in mapping file, as this was not mapped between Uniprot and dbNSFP

sheets<-c("ADRB2","BRCA1","CALM1", "HRAS", "MAPK1", "P53",  "PTEN", "SUMO1", "TPK1", "TPMT", "UBE2I")
scores_conc<-tibble()

for (gene in sheets){
  scores <- read_excel("resources/Source.xlsx", sheet=gene) %>% 
    select(variant, starts_with("DMS"))%>%
    mutate(gene=gene)%>%
    gather(key="DMS", value="DMS_val", -variant, -gene)
  scores_conc=rbind(scores_conc, scores)
}

#MSH2
scores <- read_excel("resources/MSH2_Jia_2020/1-s2.0-S0002929720304390-mmc2.xlsx", sheet="TableS5_AllVariants")%>%
  rename(variant=Variant, DMS=`LOF score`)%>%
  select(variant, starts_with("DMS"))%>%
  mutate(gene="MSH2")%>%
  gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)

#Abeta
scores <- read_excel("resources/Abeta_Seuma_2020/elife-63364-supp4-v2.xlsx", sheet="1 aa change")%>%
  mutate(variant=paste0(WT_AA, Pos, Mut))%>%
    rename( DMS_nscore=`nscore`)%>%
    select(variant, starts_with("DMS"))%>%
    mutate(gene="Abeta")%>%
    gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)


#VKOR1
scores <- read_excel("resources/VKOR1_Chiasson_2020_activity/elife-58026-fig1-data1-v1.xlsx")%>%
  rename( DMS_abundance_score=`abundance_score`)%>%
  rename( DMS_activity_score=`activity_score`)%>%
  select(variant, starts_with("DMS"))%>%
  mutate(gene="VKOR1")%>%
  gather(key="DMS", value="DMS_val", -variant, -gene)
scores_conc=rbind(scores_conc, scores)

scores_conc<-scores_conc %>% 
  filter(!is.na(DMS_val))%>%
  left_join(list_uniprot_ids, by=c("gene"="gene_name"))%>%
  rename(gene_dms=gene)%>% 
  filter(!is.na(uniprot_id))


files<-paste0(folder_of_prots, list_uniprot_ids$uniprot_id, PREDICTED_PROT_SUFFIX) 

values_joined<-lapply(files, function(x) { #  mc.cores = 1,
  return(fread(x, na.strings = c("NA","."))  ) } )

values_joined<-rbindlist(values_joined)

values_joined<-values_joined%>%
  mutate(variant=paste0(
str_replace_all(RESN_RESI, c( 
 "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D",
 "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G",
 "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K",
 "MET"="M", "PHE"="F", "PRO"="P", "SER"="S",
 "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V")),
str_replace_all(to_AS, c( 
  "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D",
  "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G",
  "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K",
  "MET"="M", "PHE"="F", "PRO"="P", "SER"="S",
  "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V"))
))

values_joined<-values_joined %>% 
  left_join(scores_conc, by=c("variant"="variant", "Uniprot_acc_split"="uniprot_id"))%>%
  filter(!is.na(DMS_val))%>%
  mutate(DMS_val=as.double(DMS_val))

dir.create("results/validation_set")
setwd("results/validation_set")

fwrite(x=values_joined, file="validation_set_w_AlphScore.csv.gz")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
library(data.table)

files <- list.files(path=snakemake@params[["loaction_csvgzs"]], pattern=".csv.gz", full.names=TRUE, recursive=FALSE)
for (x_files in 1:length(files)){
  if (file.info(files[x_files])$size < 1000){
    files[x_files]<-NA
  }
}
print(files)
files<-files[!is.na(files)]

values_joined<-lapply(files, function(x) { #  mc.cores = 1,
  training_data<-fread(x, na.strings = c("NA","."))[!is.na(CADD_raw_sur), ] # load data

  singletons <- training_data[
    ((is.na(gnomAD_exomes_AC) | (gnomAD_exomes_AC<2 & (gnomAD_exomes_NFE_AC == gnomAD_exomes_AC) ) )  & 
    gnomAD_genomes_AN>90000 &
    (!gnomAD_genomes_flag %in% c("lcr","segdup"))&
    gnomAD_genomes_AC==1 & 
    gnomAD_genomes_NFE_AC==1) & 
    (is.na(`1000Gp3_AC`) | `1000Gp3_AC`==0)  & 
    (is.na(ESP6500_AA_AC) | ESP6500_AA_AC==0) & 
    (is.na(ESP6500_EA_AC) | ESP6500_AA_AC==0) 
    ]

  frequents<- training_data[
    gnomAD_genomes_AF> 0.001 & 
    gnomAD_genomes_AN>90000 & 
    (!gnomAD_genomes_flag %in% c("lcr","segdup"))
    ]

  clinvars<- training_data[
    clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic","Benign", "Likely_benign")
    ]

  return(rbindlist(list(singletons[,`:=`(outcome=1, gnomadSet=1)],
                        frequents[,`:=` (outcome=0, gnomadSet=1)], 
                        clinvars[,`:=` (outcome=ifelse(clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic"),1,0), gnomadSet=0)]), 
                   fill=TRUE, use.names=TRUE))
})

values_joined<-rbindlist(values_joined)

fwrite(values_joined, file=snakemake@output[["csv_gz_out"]])
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
library(tidyverse)
library(VennDiagram)
futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger") # prevent VennDiagram to write lots of log messages
library(data.table)
library(optparse)
source("workflow/scripts/existing_scores_glm_functions.R")
set.seed(1)

option_list = list(
  make_option(c("-i", "--input"), type="character", default="/media/axel/Dateien/Arbeit_Gen/alphafold2/data_from_xcat_v2/gnomad_extracted_v2.csv.gz", 
              help="csv.gz file"),
  make_option(c("-o", "--output"), type="character", default="results/train_testset1/gnomad_extracted_prepro.csv.gz", 
              help="csv.gz file for output")
)
opt = parse_args(OptionParser(option_list=option_list))

variants<-read_csv(opt$input)
to_AS_table<-read_tsv("resources/to_AS_table.txt")

dir.create(dirname(opt$output))
setwd(dirname(opt$output))

# preprocess variants (see sourced file)
variants<-prepareVariantsForPrediction(variants, to_AS_table)
variants<-splitKFold(variants, 5)
variants<-setTrainTestSet(variants, 1)

# check for overlaps
gnomad_train<-variants %>% filter(gnomad_train)
clinvar_interim_test<-variants %>% filter(clinvar_interim_test)
clinvar_holdout_test<-variants %>% filter(clinvar_holdout_test)

venn.diagram(x=list(gnomad_train$Uniprot_acc_split, clinvar_interim_test$Uniprot_acc_split, clinvar_holdout_test$Uniprot_acc_split),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_id_level.png')
venn.diagram(x=list(gnomad_train$var_id_prot, clinvar_interim_test$var_id_prot, clinvar_holdout_test$var_id_prot),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_variant_id_level.png')
venn.diagram(x=list(gnomad_train$var_id_genomic, clinvar_interim_test$var_id_genomic, clinvar_holdout_test$var_id_genomic),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Variant_id_level.png')

# check for duplicates
n_occur <- data.frame(table(paste(variants$var_id_genomic, variants$gnomadSet)))
double_ids<-str_split(n_occur[n_occur$Freq > 1,]$Var1, " ", simplify=TRUE)[,1]
double_variants<-variants %>% filter(var_id_genomic %in% double_ids) %>% select(var_id_genomic, Uniprot_acc_split, RESN, RESN_RESI, outcome, gnomadSet, alt)
head(double_variants)
# duplicates are different alternative alleles at one position

write_csv(variants, file=basename(opt$output))

Multi-Step Genomic Analysis Pipeline for DeCOI WGS Cohort

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
if (!require("GenomicRanges", quietly = TRUE) & (!require("rtracklayer", quietly = TRUE)) ){
  install.packages("BiocManager", repos='http://cran.us.r-project.org')
  BiocManager::install("GenomicRanges")
  BiocManager::install("rtracklayer")
}

library(tidyverse)
library(data.table)
library(GenomicRanges)
library(rtracklayer)


gene_set_path<-"resources/gene_sets.tsv"
gene_set_path<-snakemake@input[[1]]
bed_path<-"resources/genomic_ranges.bed"
bed_path<-snakemake@input[[2]]
aaf_path<-"results/all_vars_for_RVAS/data.anno.aaf.file.txt"
aaf_path<-snakemake@input[[3]]
anno_path<-"/home/aschmidt/Arbeit_Gen/COVID_WGS/COVID_annotation/results/for_RVAS/all_contigs_anno.file.txt"
anno_path<-snakemake@input[[4]]
bim_path<-"results/all_vars_for_RVAS/eurs.bim"
bim_path<-snakemake@input[[5]]
#order_csq<-"resources/genomic_ranges.bed"
#order_csq<-snakemake@input[[6]]

new_anno_path<-"set.annos.tsv"
new_anno_path<-snakemake@output[[1]]
new_aafs_path<-"aafs.tsv"
new_aafs_path<-snakemake@output[[2]]
new_sets_path<-"sets.tsv"
new_sets_path<-snakemake@output[[3]]
relevant_variants_path<-"relevant_variants.tsv"
relevant_variants_path<-snakemake@output[[4]]
new_bim_path<-"new_bim.bim"
new_bim_path<-snakemake@output[[5]]

aafs<-read_delim(aaf_path, delim=" ", col_names = c("ID", "AAF"))
print("aafs:")
head(aafs)
annos<-read_delim(anno_path, delim=" ", col_names = c("ID", "gene_id", "annot"))
print("annos")
head(annos)

bim<-fread(bim_path, sep ="\t", col.names = c("CHR", "ID", "cM","BP","A1","A2"))

print("bim")
head(bim)

#bim_test<-head(bim, n=5000000)

bim_new_id<-bim %>%
  mutate(IDnew=paste("1", 1:n(), A1, A2, sep=":"))%>%
  mutate(BPnew=1:n())%>%
  mutate(CHRnew=1)

comb_sources<-bim_new_id %>%
  left_join(aafs, by="ID")%>%
  left_join(annos, by="ID")

print("combsource")
head(comb_sources)
head(comb_sources %>% filter(!is.na(annot)))

# make sets based on gene definitions
gene_sets<-read_tsv(gene_set_path)
new_sets<-tibble()
new_annos<-tibble()
new_aaf<-tibble()

score_conseq<-function(conseq){
  conseq_tbl<-c("pLoF",
                "missense.revel07",
                "missense.revel05",
                "missense.revel03",
                "moderate",
                "synonymous",
                "UTR5_CADD",
                "UTR5",
		            "UTR3_CADD",
                "UTR3",
                "promoter_CADD",
                "promoter",
                "enhancer_CADD",
                "enhancer",
                "regionBased")


conseq_rank<-c()
for (single_conseq in conseq){
  single_rank<-which(conseq_tbl %in% single_conseq)
  single_rank<-ifelse(is.na(single_rank), 99, single_rank)
  conseq_rank<-c(conseq_rank, single_rank)
}
conseq_rank<-
return(conseq_rank)
}


i=1
for (single_gene_set in unique(gene_sets$set)){
  rel_genes<-gene_sets %>%
    filter(set==single_gene_set)

  tmp_annos<-comb_sources %>%
    filter(gene_id %in% rel_genes$gene_id)%>%
    filter(annot!="not_relevant")%>%
    mutate(gene_id=single_gene_set)
  if (nrow(tmp_annos)==0){
    next()	
  }

  # check for variants that are added to the gene set more than once, take the most severe conseq
  mltpl_occuring_tmp<-tmp_annos %>% 
    add_count(IDnew)%>%
    filter(n>1)%>%
    select(-n)

  if (nrow(mltpl_occuring_tmp)>0){
    tmp_annos<-tmp_annos %>%
      filter(!IDnew %in% mltpl_occuring_tmp$IDnew)

    for (IDnew_tmp in unique(mltpl_occuring_tmp$IDnew)){
      mltpl_occuring_single<-mltpl_occuring_tmp %>%
        filter(IDnew==IDnew_tmp)%>%
        mutate(eff_score=score_conseq(annot))%>%
        arrange(eff_score) 

      print(IDnew_tmp)
      print(mltpl_occuring_single)

      mltpl_occuring_single<-mltpl_occuring_single %>%
        select(-eff_score)

      tmp_annos<-rbind(tmp_annos, mltpl_occuring_single[1,])
    }
  }

  new_annos<-rbind(new_annos, tmp_annos) 
  tmp_IDs<-paste(tmp_annos$IDnew, collapse=",")
  tmp_set<-tibble(gene_id=single_gene_set, chr="1", pos=i, IDs=tmp_IDs)
  new_sets<-rbind(new_sets, tmp_set)
  i=i+1 
}



#### 

# make sets based on bed file

region_sets<-import(bed_path)
head(region_sets)
region_sets_names<-str_split(region_sets$name, ";", simplify=TRUE)
region_sets_names<-as_tibble(region_sets_names)
colnames(region_sets_names)<- c("set", "gene")
set_names<-unique(region_sets_names$set)

sources_w_AF<-comb_sources %>%
  filter(!is.na(AAF))%>%
  mutate(pos_start=BP,
         pos_end=BP+nchar(A1))

head(sources_w_AF)
chroms<-paste0("chr",sources_w_AF$CHR)
chroms<-str_replace(chroms, "chr23", "chrX")

annos_granges <- GRanges(
  seqnames = chroms,
  ranges = IRanges(start=sources_w_AF$pos_start, end = sources_w_AF$pos_end) )
head(annos_granges)
for (set_name in set_names){

  tmp_region<-region_sets[region_sets_names$set==set_name]
  tmp_intersect<-GenomicRanges::intersect(annos_granges, tmp_region)

  tmp_chr_pos_inReg <-sources_w_AF[annos_granges %in% tmp_intersect,]%>%
    distinct(ID, .keep_all=TRUE)%>%
    mutate(gene_id=set_name,
           annot="regionBased")
  if (nrow(tmp_chr_pos_inReg)==0){
    next()
  }

  new_annos<-rbind(new_annos, tmp_chr_pos_inReg, fill=TRUE)

  tmp_IDs<-paste(tmp_chr_pos_inReg$IDnew, collapse=",")
  tmp_set<-tibble(gene_id=set_name, chr="1", pos=i, IDs=tmp_IDs)
  new_sets<-rbind(new_sets, tmp_set)
  i=i+1 
}


####

new_annos_export<-new_annos %>%
 distinct(IDnew, gene_id, annot)

new_aaf<-comb_sources %>% distinct(IDnew, AAF) %>%
  filter(!is.na(AAF))

relevant_variants<-unique(new_annos_export$ID)

write_delim(x=new_annos_export,
            file = new_anno_path,
            col_names = FALSE )


write_delim(x=new_aaf,
            file = new_aafs_path,
            col_names = FALSE )

write_delim(x=new_sets,
            file = new_sets_path,
            col_names = FALSE )

write(x = relevant_variants,
      file=relevant_variants_path)

bim_new_id<-bim_new_id%>%
  select(CHRnew,IDnew,cM,BPnew,A1,A2)%>%
  relocate(CHRnew,IDnew,cM,BPnew,A1,A2)

write_tsv(x=bim_new_id,
            file = new_bim_path,
            col_names = FALSE )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
library(data.table)
vars<-fread(snakemake@input[[1]], fill=TRUE, header = TRUE, na.strings = ".")
# vars<-fread("results/for_RVAS/annotated_split_vep_1.tsv", fill=TRUE, header = TRUE, na.strings = ".")

vars<-vars[, c("chrom","pos","ref","alt"):=tstrsplit(ID, ":", fixed=TRUE)]

###### MASKS ######
vars<-vars[BIOTYPE == "protein_coding"]

vars<-vars[,REVEL_score:=as.numeric(REVEL_score)]


vars<-vars[, ':=' (revel03=!is.na(REVEL_score) & REVEL_score>0.3, 
                   revel05=!is.na(REVEL_score) & REVEL_score>0.5, 
                   revel07=!is.na(REVEL_score) & REVEL_score>0.7,
                   missense=like(Consequence, "missense_variant"),
                   syn=like(Consequence, "synonymous_variant"),
                   pLOF=(IMPACT=="HIGH"),
                   UTR5=like(Consequence, "5_prime_UTR_variant"),
                   UTR3=like(Consequence, "3_prime_UTR_variant"),
		   promoter=like(Consequence,"upstream_gene_variant") & TSSDistance <= 1000,
                   enhancer=like(Consequence, "upstream_gene_variant")& TSSDistance <= 50000 & !is.na(DHS),
                   CADD10= !(is.na(CADD_PHRED) | CADD_PHRED<10) ) ]

vars<-vars[, ':=' (revel03_mis=(revel03 & missense),
                   revel05_mis=(revel05 & missense), 
                   revel07_mis=(revel07 & missense), 
                   moderate_non_missense=(!missense & (IMPACT=="MODERATE")))]

vars<-vars[, by=c("ID", "Gene"), mask_anno:=ifelse(sum(pLOF)>0,"pLoF", 
                                            ifelse(sum(revel07_mis)>0, "missense.revel07",
                                            ifelse(sum(revel05_mis)>0, "missense.revel05",
                                            ifelse(sum(revel03_mis)>0, "missense.revel03",
                                            ifelse(sum(missense | moderate_non_missense)>0, "moderate",
                                            ifelse(sum(syn)>0, "synonymous", 
                                            ifelse(sum(UTR5)>0 & sum(CADD10)>0, "UTR5_CADD", 
                                            ifelse(sum(UTR5)>0, "UTR5",
                                            ifelse(sum(UTR3)>0 & sum(CADD10)>0, "UTR3_CADD",
                                            ifelse(sum(UTR3)>0, "UTR3", 
                                            ifelse(sum(promoter)>0 & sum(CADD10)>0, "promoter_CADD", 
                                            ifelse(sum(promoter)>0, "promoter", 
                                            ifelse(sum(enhancer)>0 & sum(CADD10)>0, "enhancer_CADD", 
                                            ifelse(sum(enhancer)>0, "enhancer",
                                                   "not_relevant")
                                                                        ) ) ) ) ) ) ) ) ) ) ) ) )
]

anno_list<-rbindlist(list(vars[, c("ID", "Gene", "mask_anno")]), use.names = FALSE)

###### AF ######
vars=vars[, maxAF:=pmax(as.numeric(gnomAD_ex_AF), as.numeric(gnomAD_ge_AF), 0,
                        na.rm = TRUE)]

vars=vars[, minAF:=pmin(as.numeric(gnomAD_ex_AF), as.numeric(gnomAD_ge_AF), 1,
                        na.rm = TRUE)]

vars=vars[, maxAC:=pmax(as.numeric(gnomAD_ex_AC), as.numeric(gnomAD_ge_AC), 0, na.rm = TRUE)]

vars=vars[, maxHom:=pmax(as.numeric(gnomAD_ex_nhomalt), as.numeric(gnomAD_ge_nhomalt), 0, na.rm = TRUE)]


#vars=vars[,AF_category:=ifelse( (maxAF < 0.005) & (maxAC > 9 | maxHom > 0 ), 0.005, maxAF) ]
vars=vars[,AF_category:=maxAF]

###### SET LIST #######
collapsed_vars<-vars[,concats:=paste(ID, collapse=","), by=Gene]
collapsed_vars<-unique(collapsed_vars[,c("Gene","chrom","concats")])
collapsed_vars<-collapsed_vars[,pos:=1:nrow(collapsed_vars)]
collapsed_vars<-collapsed_vars[Gene!="" & !is.na(Gene), ]


fwrite(unique(anno_list), file = snakemake@output[[1]], col.names = FALSE, sep=" ")
fwrite(unique(vars[, c("ID", "AF_category")]), file = snakemake@output[[2]], col.names = FALSE, sep=" ")
fwrite((collapsed_vars[,c("Gene","chrom","pos","concats")]), file = snakemake@output[[3]], col.names = FALSE, sep=" ")

fwrite(vars[, -"concats"], file = snakemake@output[[4]], col.names = TRUE, sep="\t")

pipeline i used for calling sQTL in chimps

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
library(tidyverse)
library(data.table)
library(qvalue)
library(stats)


args = commandArgs(trailingOnly=TRUE)
PermutationsTableFile <- args[1]
ActualTableFile <- args[2]
TableOutFile <- args[3]

## Files for testing
# setwd("~/CurrentProjects/sQTL_pipeline/")
# PermutationsTableFile <- "sQTL_mapping/MatrixEQTL/ConfigCovariateModelResults/PermutationsCombined.txt"
# ActualTableFile <- "sQTL_mapping/MatrixEQTL/ConfigCovariateModelResults/BestPvalueNonPermuted.txt"
# TableOutFile <- "~/temp/test.txt"

# read table of min p values from permutations
permutationTable <- t(fread(PermutationsTableFile, sep='\t', header=T))

# read table of min p values from real data
ActualTable <- fread(ActualTableFile, sep='\t', header=T) %>%
  as.data.frame() %>% tibble::column_to_rownames("Gene")

# Merge the two tables, putting the real data as the first column.
# If you are interested in gene level or cluster-level tests, this where you
# should group and find minimum Pval. Here I did cluster level P-values.
MergedTable <- merge(ActualTable, permutationTable, by="row.names") %>%
  mutate(clusterName=gsub("^(.+?):.+?:.+?:(clu_\\d+).+$", "\\1.\\2", Row.names, perl=T)) %>%
  dplyr::group_by(clusterName) %>% dplyr::summarise_all(dplyr::funs(min)) %>%
  dplyr::ungroup() %>% dplyr::select(-Row.names) %>% as.data.frame() %>%
  tibble::column_to_rownames("clusterName")

PermutationTest <- function(PvalVector){
  return(ecdf(PvalVector)(PvalVector[1]))
}


OutTable <- data.frame(
  BestActualPval=MergedTable$MinP,
  PermutationTestPval=apply(MergedTable, 1, PermutationTest))
OutTable$PermutationTestQvalue <- qvalue(OutTable$PermutationTestPval)$qvalue

OutTable %>%
  tibble::rownames_to_column(var = "GroupedTrait") %>%
  write.table(file=TableOutFile, quote=F, sep='\t')
tool / cran

data.table

Extension of 'data.frame': Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.