Workflow Steps and Code Snippets
286 tagged steps and code snippets that match keyword data.table
QTL Snakemake Workflow
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | library(yaml) library(data.table) library(devtools) install_github("bmansfeld/QTLseqr",upgrade_dependencies= TRUE) library("QTLseqr") library("ggplot2") config <- yaml.load_file("config.yaml") QTL_Plotting <- function(HighBulk,LowBulk, No_of_Chromosomes){ options(warn=-1) table_file <- 'QTL_VCF_to_Table/QTL_Table.csv' chrom_names<-as.list(unique(fread(table_file,sep = "\t", select = c("CHROM") ))) #chrom_vcf_names<- as.character.Array(droplevels(scan_vcf$`*:*-*`$rowRanges@seqnames@values[1:No_of_Chromosomes])) df <- importFromTable(file = table_file, highBulk = HighBulk, lowBulk = LowBulk, chromList = chrom_names$CHROM[1:No_of_Chromosomes],sep = '\t') ggplot(data = df)+geom_histogram(aes(x = DP.HIGH + DP.LOW))+xlim(0,1000) ggsave(filename="QTL_Plots/DP_Filtering Data",device= "pdf", width=20, height=10) ggplot(data = df)+geom_histogram(aes(x = REF_FRQ)) ggsave(filename="QTL_Plots/REF Frequency for Filtering Data",device = "pdf", width=20, height=10) ggplot(data = df)+geom_histogram(aes(x = SNPindex.HIGH)) ggsave(filename = "QTL_Plots/SNP Index for Filtering Data",device="pdf", width=20, height=10) df_filt <- filterSNPs( SNPset = df, refAlleleFreq = config$QTL_Parser$REF_Allel_Frequency, minTotalDepth = config$QTL_Parser$Min_Total_Depth, maxTotalDepth = config$QTL_Parser$Max_Total_Depth, depthDifference = config$QTL_Parser$Depth_Difference, minSampleDepth = config$QTL_Parser$Min_Sample_Depth, minGQ = config$QTL_Parser$Min_GQ, verbose = TRUE ) df_filt <- runQTLseqAnalysis(df_filt, windowSize = config$QTL_Parser$WindowSize, popStruc = "F2", bulkSize = c(config$QTL_Parser$High_Bulk_Size, config$QTL_Parser$Low_Bulk_Size), replications = 10000, intervals = c(95, 99)) df_filt <- runGprimeAnalysis(df_filt,windowSize = config$QTL_Parser$WindowSize,outlierFilter = "deltaSNP",filterThreshold = config$QTL_Parser$Filter_Threshold) plotGprimeDist(SNPset = df_filt, outlierFilter = "Hampel") ggsave(filename = "QTL_Plots/GPrime Distribution with Hampel Outlier Filter",device = "pdf", width=20, height=10) plotGprimeDist(SNPset =df_filt, outlierFilter = "deltaSNP", filterThreshold = config$QTL_Parser$Filter_Threshold) ggsave(filename = "QTL_Plots/GPrime Distribution with deltaSNP Outlier Filter",device = "pdf", width=20, height=10) p1 <- plotQTLStats(SNPset = df_filt, var = "nSNPs") ggsave(filename = "QTL_Plots/SNP Density Plot",plot=p1,device = "pdf", width=20, height=10) p2 <- plotQTLStats(SNPset = df_filt, var = "deltaSNP", plotIntervals = TRUE) ggsave(filename = "QTL_Plots/Delta SNP Index Plot with Intervals",plot=p2,device = "pdf", width=20, height=10) p3 <- plotQTLStats(SNPset = df_filt, var = "Gprime", plotThreshold = TRUE, q = config$QTL_Parser$FDR_q) ggsave(filename = "QTL_Plots/GPrime Value Plot",plot=p3,device = "pdf", width=20, height=10) } QTL_Plotting(config$QTL_Parser$HighBulk,config$QTL_Parser$LowBulk,config$QTL_Parser$Number_of_Chromosomes) |
Automatic identification, classification and annotation of plant lncRNAs in transcriptomic sequences
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | library(ggpubr) library(dplyr) library(purrr) library(BioQC) library(ggplot2) library(data.table) args <- commandArgs(TRUE) entrop <- function(a) { ent <- entropySpecificity(a, norm=TRUE) return(ent) } trans <- function(a) { a$target_id <- NULL a$type <- NULL a$sum <- NULL entrop(a) } type <- function(a) { col_name <- colnames(a) col_name <- col_name[! col_name %in% c("type", "target_id")] a$sum <- rowSums(a[,col_name], na.rm=TRUE) a <- filter(a, sum >0) ent <- trans(a) a$entropy <- ent return(a) } plot <- function(a,b,c) { ggplot() + geom_density(data = a, aes(x = entropy, fill="Non"), alpha=0.4)+ geom_density(data = b, aes(x = entropy, fill="Cons"), alpha=0.4)+ geom_density(data = c, aes(x = entropy, fill="mRNA"), alpha=0.4)+ scale_colour_manual("", breaks = c("Non", "Cons", "mRNA")) + xlab('entropy') + ylab('density') + guides(fill = guide_legend(title = "Types")) + theme(text = element_text(size = 20)) ggsave("data/output/expression/entropy.png", width = 20, height = 20) } log <- function(a) { col_name <- colnames(a) col_name <- col_name[! col_name %in% c("type", "target_id")] col <- c() for (x in col_name) { col <- append(col, paste("log_", noquote(x), sep = "")) a[, paste("log_", noquote(x), sep = "")] <- log2(a[, noquote(x)]) } new_col <<- col return(a) } # install dataframe df <- read.table(args[1], header = TRUE) # log of columns df_log <- log(df) # exprassin analysis df_log[df_log == -Inf] <- 0 my_comparisons = list(c("consv", "non_cons"), c("consv", "prot"), c("prot", "non_cons")) ggboxplot(df_log, x="type", y= new_col , merge = "flip", ylab = "log2(TPM)", color = "type", palette = "jco")+ font("subtitle", size = 20)+ font("caption", size = 20)+ font("xlab", size = 20)+ font("ylab", size = 20)+ font("xy.text", size = 20) +theme(legend.text=element_text(size=20)) ggsave("data/output/expression/box_plot.png", width = 20, height = 20) ggdensity(df_log, x = new_col , y = "..density..", facet.by = "type", merge = TRUE, xlab = "TPM", rug = TRUE, add = "median", palette = "jco") + font("subtitle", size = 20)+ font("caption", size = 20)+ font("xlab", size = 20)+ font("ylab", size = 20)+ font("xy.text", size = 20) + theme(legend.text=element_text(size=20)) ggsave("data/output/expression/hist_expr.png", width = 20, height = 20) # entropy analysis non <-df[grepl("noncons", df$type),] cons <-df[grepl("consv", df$type),] mRNA <-df[grepl("mRNA", df$type),] non_ent <- type(non) cons_ent <- type(cons) mRNA_ent <- type(mRNA) plot(non_ent, cons_ent, mRNA_ent) |
Metagenomic pipeline managed by snakemake (1.0)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | args <- commandArgs(trailingOnly = TRUE) library(dplyr) library(tidyr) library(data.table) fixCollapsed <- function(df){ colnames(df) <- c("key", "value") df <- df %>% mutate(key = strsplit(key, "; ")) %>% unnest(key) df <- df[, c(2, 1)] return(df) } fixDuplicated <- function(df){ df <- df %>% group_by(key) %>% summarise(value = paste(value, collapse = "; ")) values <- strsplit(df$value, "; ") values <- lapply(values, unique) values <- sapply(values, paste, collapse = "; ") df$value <- values return(df) } removeUnknown <- function(df){ idx <- grepl("^-", df$key) df <- df[!idx,] return(df) } df <- fread(args[2], stringsAsFactors = FALSE, head = FALSE, nThread = as.integer(args[4])) df <- as.data.frame(df) df %>% fixCollapsed() %>% fixDuplicated() %>% removeUnknown() %>% fwrite(file = args[1], sep = "\t", nThread = as.integer(args[4])) df <- fread(args[3], stringsAsFactors = FALSE, head = FALSE, nThread = as.integer(args[4])) df <- as.data.frame(df) df %>% fixCollapsed() %>% fixDuplicated() %>% removeUnknown() %>% fwrite(file = args[1], sep = "\t", append = TRUE, nThread = as.integer(args[4])) |
Use an ensemble of variant callers to call variants from ATAC-seq data (v0.3.3)
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | args <- commandArgs(trailingOnly = TRUE) test.data<- args[1] model <- args[2] output<- args[3] # load libraries library(data.table) library(plyr) library(dplyr) suppressMessages(library(mlr)) # load model print("loading appropriate model") load(model) # load test print("loading and formatting test data") test<- read.table(test.data, header=TRUE, sep="\t", na.strings=c("NA",".","na","N/A"), skipNul=FALSE, row.names=NULL) # making predictions print("making predictions and outputting results") pred= predict(fit, newdata= test, type="prob") write.table(pred$data, sep='\t', quote=FALSE, row.names=FALSE, na=".", output) |
Improving pathogenicity prediction of missense variants by using AlphaFold-derived features (0.9)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | library(tidyverse) library(readxl) library(data.table) library(optparse) option_list = list( make_option(c("-m", "--mapping"), type="character", default="resources/mapping_genename_uniprot.tsv", help="location of file that maps gene name in assay to UniProt ID"), make_option(c("-p", "--prot_folder"), type="character", default="results/predicted_prots_eval", help="Folder with protein files with AlphScore predictions"), make_option(c("-a", "--suffix"), type="character", default="_w_AlphScore_red_FALSE.csv.gz", help="suffix that protein files with AlphScore predictions have") ) opt = parse_args(OptionParser(option_list=option_list)) list_uniprot_ids<-read_tsv(opt$mapping) PREDICTED_PROT_SUFFIX=opt$suffix folder_of_prots=opt$prot_folder # MAPK1 / Uniprot P28482 not in mapping file, as this was not mapped between Uniprot and dbNSFP sheets<-c("ADRB2","BRCA1","CALM1", "HRAS", "MAPK1", "P53", "PTEN", "SUMO1", "TPK1", "TPMT", "UBE2I") scores_conc<-tibble() for (gene in sheets){ scores <- read_excel("resources/Source.xlsx", sheet=gene) %>% select(variant, starts_with("DMS"))%>% mutate(gene=gene)%>% gather(key="DMS", value="DMS_val", -variant, -gene) scores_conc=rbind(scores_conc, scores) } #MSH2 scores <- read_excel("resources/MSH2_Jia_2020/1-s2.0-S0002929720304390-mmc2.xlsx", sheet="TableS5_AllVariants")%>% rename(variant=Variant, DMS=`LOF score`)%>% select(variant, starts_with("DMS"))%>% mutate(gene="MSH2")%>% gather(key="DMS", value="DMS_val", -variant, -gene) scores_conc=rbind(scores_conc, scores) #Abeta scores <- read_excel("resources/Abeta_Seuma_2020/elife-63364-supp4-v2.xlsx", sheet="1 aa change")%>% mutate(variant=paste0(WT_AA, Pos, Mut))%>% rename( DMS_nscore=`nscore`)%>% select(variant, starts_with("DMS"))%>% mutate(gene="Abeta")%>% gather(key="DMS", value="DMS_val", -variant, -gene) scores_conc=rbind(scores_conc, scores) #VKOR1 scores <- read_excel("resources/VKOR1_Chiasson_2020_activity/elife-58026-fig1-data1-v1.xlsx")%>% rename( DMS_abundance_score=`abundance_score`)%>% rename( DMS_activity_score=`activity_score`)%>% select(variant, starts_with("DMS"))%>% mutate(gene="VKOR1")%>% gather(key="DMS", value="DMS_val", -variant, -gene) scores_conc=rbind(scores_conc, scores) scores_conc<-scores_conc %>% filter(!is.na(DMS_val))%>% left_join(list_uniprot_ids, by=c("gene"="gene_name"))%>% rename(gene_dms=gene)%>% filter(!is.na(uniprot_id)) files<-paste0(folder_of_prots, list_uniprot_ids$uniprot_id, PREDICTED_PROT_SUFFIX) values_joined<-lapply(files, function(x) { # mc.cores = 1, return(fread(x, na.strings = c("NA",".")) ) } ) values_joined<-rbindlist(values_joined) values_joined<-values_joined%>% mutate(variant=paste0( str_replace_all(RESN_RESI, c( "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D", "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G", "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K", "MET"="M", "PHE"="F", "PRO"="P", "SER"="S", "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V")), str_replace_all(to_AS, c( "ALA"="A", "ARG"="R", "ASN"="N", "ASP"="D", "CYS"="C", "GLU"="E", "GLN"="Q", "GLY"="G", "HIS"="H", "ILE"="I", "LEU"="L", "LYS"="K", "MET"="M", "PHE"="F", "PRO"="P", "SER"="S", "THR"="T", "TRP"="W", "TYR"="Y", "VAL"="V")) )) values_joined<-values_joined %>% left_join(scores_conc, by=c("variant"="variant", "Uniprot_acc_split"="uniprot_id"))%>% filter(!is.na(DMS_val))%>% mutate(DMS_val=as.double(DMS_val)) dir.create("results/validation_set") setwd("results/validation_set") fwrite(x=values_joined, file="validation_set_w_AlphScore.csv.gz") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | library(data.table) files <- list.files(path=snakemake@params[["loaction_csvgzs"]], pattern=".csv.gz", full.names=TRUE, recursive=FALSE) for (x_files in 1:length(files)){ if (file.info(files[x_files])$size < 1000){ files[x_files]<-NA } } print(files) files<-files[!is.na(files)] values_joined<-lapply(files, function(x) { # mc.cores = 1, training_data<-fread(x, na.strings = c("NA","."))[!is.na(CADD_raw_sur), ] # load data singletons <- training_data[ ((is.na(gnomAD_exomes_AC) | (gnomAD_exomes_AC<2 & (gnomAD_exomes_NFE_AC == gnomAD_exomes_AC) ) ) & gnomAD_genomes_AN>90000 & (!gnomAD_genomes_flag %in% c("lcr","segdup"))& gnomAD_genomes_AC==1 & gnomAD_genomes_NFE_AC==1) & (is.na(`1000Gp3_AC`) | `1000Gp3_AC`==0) & (is.na(ESP6500_AA_AC) | ESP6500_AA_AC==0) & (is.na(ESP6500_EA_AC) | ESP6500_AA_AC==0) ] frequents<- training_data[ gnomAD_genomes_AF> 0.001 & gnomAD_genomes_AN>90000 & (!gnomAD_genomes_flag %in% c("lcr","segdup")) ] clinvars<- training_data[ clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic","Benign", "Likely_benign") ] return(rbindlist(list(singletons[,`:=`(outcome=1, gnomadSet=1)], frequents[,`:=` (outcome=0, gnomadSet=1)], clinvars[,`:=` (outcome=ifelse(clinvar_clnsig %in% c("Likely_pathogenic","Pathogenic"),1,0), gnomadSet=0)]), fill=TRUE, use.names=TRUE)) }) values_joined<-rbindlist(values_joined) fwrite(values_joined, file=snakemake@output[["csv_gz_out"]]) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | library(tidyverse) library(VennDiagram) futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger") # prevent VennDiagram to write lots of log messages library(data.table) library(optparse) source("workflow/scripts/existing_scores_glm_functions.R") set.seed(1) option_list = list( make_option(c("-i", "--input"), type="character", default="/media/axel/Dateien/Arbeit_Gen/alphafold2/data_from_xcat_v2/gnomad_extracted_v2.csv.gz", help="csv.gz file"), make_option(c("-o", "--output"), type="character", default="results/train_testset1/gnomad_extracted_prepro.csv.gz", help="csv.gz file for output") ) opt = parse_args(OptionParser(option_list=option_list)) variants<-read_csv(opt$input) to_AS_table<-read_tsv("resources/to_AS_table.txt") dir.create(dirname(opt$output)) setwd(dirname(opt$output)) # preprocess variants (see sourced file) variants<-prepareVariantsForPrediction(variants, to_AS_table) variants<-splitKFold(variants, 5) variants<-setTrainTestSet(variants, 1) # check for overlaps gnomad_train<-variants %>% filter(gnomad_train) clinvar_interim_test<-variants %>% filter(clinvar_interim_test) clinvar_holdout_test<-variants %>% filter(clinvar_holdout_test) venn.diagram(x=list(gnomad_train$Uniprot_acc_split, clinvar_interim_test$Uniprot_acc_split, clinvar_holdout_test$Uniprot_acc_split),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_id_level.png') venn.diagram(x=list(gnomad_train$var_id_prot, clinvar_interim_test$var_id_prot, clinvar_holdout_test$var_id_prot),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Protein_variant_id_level.png') venn.diagram(x=list(gnomad_train$var_id_genomic, clinvar_interim_test$var_id_genomic, clinvar_holdout_test$var_id_genomic),category.names = c("gnomAD train" , "ClinVar interim", "ClinVar hold out"), filename = 'Variant_id_level.png') # check for duplicates n_occur <- data.frame(table(paste(variants$var_id_genomic, variants$gnomadSet))) double_ids<-str_split(n_occur[n_occur$Freq > 1,]$Var1, " ", simplify=TRUE)[,1] double_variants<-variants %>% filter(var_id_genomic %in% double_ids) %>% select(var_id_genomic, Uniprot_acc_split, RESN, RESN_RESI, outcome, gnomadSet, alt) head(double_variants) # duplicates are different alternative alleles at one position write_csv(variants, file=basename(opt$output)) |
Multi-Step Genomic Analysis Pipeline for DeCOI WGS Cohort
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | if (!require("GenomicRanges", quietly = TRUE) & (!require("rtracklayer", quietly = TRUE)) ){ install.packages("BiocManager", repos='http://cran.us.r-project.org') BiocManager::install("GenomicRanges") BiocManager::install("rtracklayer") } library(tidyverse) library(data.table) library(GenomicRanges) library(rtracklayer) gene_set_path<-"resources/gene_sets.tsv" gene_set_path<-snakemake@input[[1]] bed_path<-"resources/genomic_ranges.bed" bed_path<-snakemake@input[[2]] aaf_path<-"results/all_vars_for_RVAS/data.anno.aaf.file.txt" aaf_path<-snakemake@input[[3]] anno_path<-"/home/aschmidt/Arbeit_Gen/COVID_WGS/COVID_annotation/results/for_RVAS/all_contigs_anno.file.txt" anno_path<-snakemake@input[[4]] bim_path<-"results/all_vars_for_RVAS/eurs.bim" bim_path<-snakemake@input[[5]] #order_csq<-"resources/genomic_ranges.bed" #order_csq<-snakemake@input[[6]] new_anno_path<-"set.annos.tsv" new_anno_path<-snakemake@output[[1]] new_aafs_path<-"aafs.tsv" new_aafs_path<-snakemake@output[[2]] new_sets_path<-"sets.tsv" new_sets_path<-snakemake@output[[3]] relevant_variants_path<-"relevant_variants.tsv" relevant_variants_path<-snakemake@output[[4]] new_bim_path<-"new_bim.bim" new_bim_path<-snakemake@output[[5]] aafs<-read_delim(aaf_path, delim=" ", col_names = c("ID", "AAF")) print("aafs:") head(aafs) annos<-read_delim(anno_path, delim=" ", col_names = c("ID", "gene_id", "annot")) print("annos") head(annos) bim<-fread(bim_path, sep ="\t", col.names = c("CHR", "ID", "cM","BP","A1","A2")) print("bim") head(bim) #bim_test<-head(bim, n=5000000) bim_new_id<-bim %>% mutate(IDnew=paste("1", 1:n(), A1, A2, sep=":"))%>% mutate(BPnew=1:n())%>% mutate(CHRnew=1) comb_sources<-bim_new_id %>% left_join(aafs, by="ID")%>% left_join(annos, by="ID") print("combsource") head(comb_sources) head(comb_sources %>% filter(!is.na(annot))) # make sets based on gene definitions gene_sets<-read_tsv(gene_set_path) new_sets<-tibble() new_annos<-tibble() new_aaf<-tibble() score_conseq<-function(conseq){ conseq_tbl<-c("pLoF", "missense.revel07", "missense.revel05", "missense.revel03", "moderate", "synonymous", "UTR5_CADD", "UTR5", "UTR3_CADD", "UTR3", "promoter_CADD", "promoter", "enhancer_CADD", "enhancer", "regionBased") conseq_rank<-c() for (single_conseq in conseq){ single_rank<-which(conseq_tbl %in% single_conseq) single_rank<-ifelse(is.na(single_rank), 99, single_rank) conseq_rank<-c(conseq_rank, single_rank) } conseq_rank<- return(conseq_rank) } i=1 for (single_gene_set in unique(gene_sets$set)){ rel_genes<-gene_sets %>% filter(set==single_gene_set) tmp_annos<-comb_sources %>% filter(gene_id %in% rel_genes$gene_id)%>% filter(annot!="not_relevant")%>% mutate(gene_id=single_gene_set) if (nrow(tmp_annos)==0){ next() } # check for variants that are added to the gene set more than once, take the most severe conseq mltpl_occuring_tmp<-tmp_annos %>% add_count(IDnew)%>% filter(n>1)%>% select(-n) if (nrow(mltpl_occuring_tmp)>0){ tmp_annos<-tmp_annos %>% filter(!IDnew %in% mltpl_occuring_tmp$IDnew) for (IDnew_tmp in unique(mltpl_occuring_tmp$IDnew)){ mltpl_occuring_single<-mltpl_occuring_tmp %>% filter(IDnew==IDnew_tmp)%>% mutate(eff_score=score_conseq(annot))%>% arrange(eff_score) print(IDnew_tmp) print(mltpl_occuring_single) mltpl_occuring_single<-mltpl_occuring_single %>% select(-eff_score) tmp_annos<-rbind(tmp_annos, mltpl_occuring_single[1,]) } } new_annos<-rbind(new_annos, tmp_annos) tmp_IDs<-paste(tmp_annos$IDnew, collapse=",") tmp_set<-tibble(gene_id=single_gene_set, chr="1", pos=i, IDs=tmp_IDs) new_sets<-rbind(new_sets, tmp_set) i=i+1 } #### # make sets based on bed file region_sets<-import(bed_path) head(region_sets) region_sets_names<-str_split(region_sets$name, ";", simplify=TRUE) region_sets_names<-as_tibble(region_sets_names) colnames(region_sets_names)<- c("set", "gene") set_names<-unique(region_sets_names$set) sources_w_AF<-comb_sources %>% filter(!is.na(AAF))%>% mutate(pos_start=BP, pos_end=BP+nchar(A1)) head(sources_w_AF) chroms<-paste0("chr",sources_w_AF$CHR) chroms<-str_replace(chroms, "chr23", "chrX") annos_granges <- GRanges( seqnames = chroms, ranges = IRanges(start=sources_w_AF$pos_start, end = sources_w_AF$pos_end) ) head(annos_granges) for (set_name in set_names){ tmp_region<-region_sets[region_sets_names$set==set_name] tmp_intersect<-GenomicRanges::intersect(annos_granges, tmp_region) tmp_chr_pos_inReg <-sources_w_AF[annos_granges %in% tmp_intersect,]%>% distinct(ID, .keep_all=TRUE)%>% mutate(gene_id=set_name, annot="regionBased") if (nrow(tmp_chr_pos_inReg)==0){ next() } new_annos<-rbind(new_annos, tmp_chr_pos_inReg, fill=TRUE) tmp_IDs<-paste(tmp_chr_pos_inReg$IDnew, collapse=",") tmp_set<-tibble(gene_id=set_name, chr="1", pos=i, IDs=tmp_IDs) new_sets<-rbind(new_sets, tmp_set) i=i+1 } #### new_annos_export<-new_annos %>% distinct(IDnew, gene_id, annot) new_aaf<-comb_sources %>% distinct(IDnew, AAF) %>% filter(!is.na(AAF)) relevant_variants<-unique(new_annos_export$ID) write_delim(x=new_annos_export, file = new_anno_path, col_names = FALSE ) write_delim(x=new_aaf, file = new_aafs_path, col_names = FALSE ) write_delim(x=new_sets, file = new_sets_path, col_names = FALSE ) write(x = relevant_variants, file=relevant_variants_path) bim_new_id<-bim_new_id%>% select(CHRnew,IDnew,cM,BPnew,A1,A2)%>% relocate(CHRnew,IDnew,cM,BPnew,A1,A2) write_tsv(x=bim_new_id, file = new_bim_path, col_names = FALSE ) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 | library(data.table) vars<-fread(snakemake@input[[1]], fill=TRUE, header = TRUE, na.strings = ".") # vars<-fread("results/for_RVAS/annotated_split_vep_1.tsv", fill=TRUE, header = TRUE, na.strings = ".") vars<-vars[, c("chrom","pos","ref","alt"):=tstrsplit(ID, ":", fixed=TRUE)] ###### MASKS ###### vars<-vars[BIOTYPE == "protein_coding"] vars<-vars[,REVEL_score:=as.numeric(REVEL_score)] vars<-vars[, ':=' (revel03=!is.na(REVEL_score) & REVEL_score>0.3, revel05=!is.na(REVEL_score) & REVEL_score>0.5, revel07=!is.na(REVEL_score) & REVEL_score>0.7, missense=like(Consequence, "missense_variant"), syn=like(Consequence, "synonymous_variant"), pLOF=(IMPACT=="HIGH"), UTR5=like(Consequence, "5_prime_UTR_variant"), UTR3=like(Consequence, "3_prime_UTR_variant"), promoter=like(Consequence,"upstream_gene_variant") & TSSDistance <= 1000, enhancer=like(Consequence, "upstream_gene_variant")& TSSDistance <= 50000 & !is.na(DHS), CADD10= !(is.na(CADD_PHRED) | CADD_PHRED<10) ) ] vars<-vars[, ':=' (revel03_mis=(revel03 & missense), revel05_mis=(revel05 & missense), revel07_mis=(revel07 & missense), moderate_non_missense=(!missense & (IMPACT=="MODERATE")))] vars<-vars[, by=c("ID", "Gene"), mask_anno:=ifelse(sum(pLOF)>0,"pLoF", ifelse(sum(revel07_mis)>0, "missense.revel07", ifelse(sum(revel05_mis)>0, "missense.revel05", ifelse(sum(revel03_mis)>0, "missense.revel03", ifelse(sum(missense | moderate_non_missense)>0, "moderate", ifelse(sum(syn)>0, "synonymous", ifelse(sum(UTR5)>0 & sum(CADD10)>0, "UTR5_CADD", ifelse(sum(UTR5)>0, "UTR5", ifelse(sum(UTR3)>0 & sum(CADD10)>0, "UTR3_CADD", ifelse(sum(UTR3)>0, "UTR3", ifelse(sum(promoter)>0 & sum(CADD10)>0, "promoter_CADD", ifelse(sum(promoter)>0, "promoter", ifelse(sum(enhancer)>0 & sum(CADD10)>0, "enhancer_CADD", ifelse(sum(enhancer)>0, "enhancer", "not_relevant") ) ) ) ) ) ) ) ) ) ) ) ) ) ] anno_list<-rbindlist(list(vars[, c("ID", "Gene", "mask_anno")]), use.names = FALSE) ###### AF ###### vars=vars[, maxAF:=pmax(as.numeric(gnomAD_ex_AF), as.numeric(gnomAD_ge_AF), 0, na.rm = TRUE)] vars=vars[, minAF:=pmin(as.numeric(gnomAD_ex_AF), as.numeric(gnomAD_ge_AF), 1, na.rm = TRUE)] vars=vars[, maxAC:=pmax(as.numeric(gnomAD_ex_AC), as.numeric(gnomAD_ge_AC), 0, na.rm = TRUE)] vars=vars[, maxHom:=pmax(as.numeric(gnomAD_ex_nhomalt), as.numeric(gnomAD_ge_nhomalt), 0, na.rm = TRUE)] #vars=vars[,AF_category:=ifelse( (maxAF < 0.005) & (maxAC > 9 | maxHom > 0 ), 0.005, maxAF) ] vars=vars[,AF_category:=maxAF] ###### SET LIST ####### collapsed_vars<-vars[,concats:=paste(ID, collapse=","), by=Gene] collapsed_vars<-unique(collapsed_vars[,c("Gene","chrom","concats")]) collapsed_vars<-collapsed_vars[,pos:=1:nrow(collapsed_vars)] collapsed_vars<-collapsed_vars[Gene!="" & !is.na(Gene), ] fwrite(unique(anno_list), file = snakemake@output[[1]], col.names = FALSE, sep=" ") fwrite(unique(vars[, c("ID", "AF_category")]), file = snakemake@output[[2]], col.names = FALSE, sep=" ") fwrite((collapsed_vars[,c("Gene","chrom","pos","concats")]), file = snakemake@output[[3]], col.names = FALSE, sep=" ") fwrite(vars[, -"concats"], file = snakemake@output[[4]], col.names = TRUE, sep="\t") |
pipeline i used for calling sQTL in chimps
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | library(tidyverse) library(data.table) library(qvalue) library(stats) args = commandArgs(trailingOnly=TRUE) PermutationsTableFile <- args[1] ActualTableFile <- args[2] TableOutFile <- args[3] ## Files for testing # setwd("~/CurrentProjects/sQTL_pipeline/") # PermutationsTableFile <- "sQTL_mapping/MatrixEQTL/ConfigCovariateModelResults/PermutationsCombined.txt" # ActualTableFile <- "sQTL_mapping/MatrixEQTL/ConfigCovariateModelResults/BestPvalueNonPermuted.txt" # TableOutFile <- "~/temp/test.txt" # read table of min p values from permutations permutationTable <- t(fread(PermutationsTableFile, sep='\t', header=T)) # read table of min p values from real data ActualTable <- fread(ActualTableFile, sep='\t', header=T) %>% as.data.frame() %>% tibble::column_to_rownames("Gene") # Merge the two tables, putting the real data as the first column. # If you are interested in gene level or cluster-level tests, this where you # should group and find minimum Pval. Here I did cluster level P-values. MergedTable <- merge(ActualTable, permutationTable, by="row.names") %>% mutate(clusterName=gsub("^(.+?):.+?:.+?:(clu_\\d+).+$", "\\1.\\2", Row.names, perl=T)) %>% dplyr::group_by(clusterName) %>% dplyr::summarise_all(dplyr::funs(min)) %>% dplyr::ungroup() %>% dplyr::select(-Row.names) %>% as.data.frame() %>% tibble::column_to_rownames("clusterName") PermutationTest <- function(PvalVector){ return(ecdf(PvalVector)(PvalVector[1])) } OutTable <- data.frame( BestActualPval=MergedTable$MinP, PermutationTestPval=apply(MergedTable, 1, PermutationTest)) OutTable$PermutationTestQvalue <- qvalue(OutTable$PermutationTestPval)$qvalue OutTable %>% tibble::rownames_to_column(var = "GroupedTrait") %>% write.table(file=TableOutFile, quote=F, sep='\t') |
tool / cran
data.table
Extension of 'data.frame': Fast aggregation of large data (e.g. 100GB in RAM), fast ordered joins, fast add/modify/delete of columns by group using no copies at all, list columns, friendly and fast character-separated-value read/write. Offers a natural and flexible syntax, for faster development.