BioWorkflows: A snakemake pipeline to generate the transcriptomic signatures of drug susceptibility used in beyondcell

script:
    "../scripts/common_annotate_lines.py"

SnakeMake From line 13 of rules/common.smk

script:
    "../scripts/prism_raw_counts_from_expected_counts.R"

SnakeMake From line 32 of rules/common.smk

script:
    "../scripts/ctrp_generate_curves.R"

SnakeMake From line 22 of rules/ctrp_signatures.smk

script:
    "../scripts/ctrp_generate_ebayes_model.R"

SnakeMake From line 38 of rules/ctrp_signatures.smk

script:
    "../scripts/ctrp_signature_from_ebayes.R"

SnakeMake From line 57 of rules/ctrp_signatures.smk

script:
    "../scripts/ctrp_signature_from_ebayes.R"

SnakeMake From line 75 of rules/ctrp_signatures.smk

script:
    "../scripts/ctrp_generate_drug_db.R"

SnakeMake From line 17 of rules/database.smk

script:
    "../scripts/gdsc_generate_drug_db.R"

SnakeMake From line 38 of rules/database.smk

script:
    "../scripts/gdsc_generate_drug_db.R"

SnakeMake From line 59 of rules/database.smk

script:
    "../scripts/prism_generate_drug_db.R"

SnakeMake From line 78 of rules/database.smk

script:
    "../scripts/common_build_drug_db.R"

SnakeMake From line 99 of rules/database.smk

script:
    "../scripts/dependencies_enrichment_from_ebayes.R"

SnakeMake From line 115 of rules/database.smk

script:
    "../scripts/dependencies_drug_enrichment_from_ebayes.R"

SnakeMake From line 131 of rules/database.smk

script:
    "../scripts/dependencies_annotate_dependencies.R"

SnakeMake From line 16 of rules/dependency_signatures.smk

script:
    "../scripts/dependencies_generate_ebayes_model.R"

SnakeMake From line 34 of rules/dependency_signatures.smk

script:
    "../scripts/dependency_signature_from_ebayes.R"

SnakeMake From line 51 of rules/dependency_signatures.smk

script:
    "../scripts/gdsc_download_raw_files.R"

SnakeMake From line 12 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_normalize_arrays.R"

SnakeMake From line 29 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_generate_curves.R"

SnakeMake From line 50 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_generate_ebayes_model.R"

SnakeMake From line 66 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/probeID_hgnc_table.R"

SnakeMake From line 83 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_signature_from_ebayes.R"

SnakeMake From line 105 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_signature_from_ebayes.R"

SnakeMake From line 126 of rules/gdsc_arrays_signatures.smk

script:
    "../scripts/gdsc_rna_generate_curves.R"

SnakeMake From line 20 of rules/gdsc_rnaseq_signatures.smk

script:
    "../scripts/gdsc_rna_generate_ebayes_model.R"

SnakeMake From line 38 of rules/gdsc_rnaseq_signatures.smk

script:
    "../scripts/gdsc_rna_signature_from_ebayes.R"

SnakeMake From line 59 of rules/gdsc_rnaseq_signatures.smk

script:
    "../scripts/gdsc_rna_signature_from_ebayes.R"

SnakeMake From line 79 of rules/gdsc_rnaseq_signatures.smk

script:
    "../scripts/metastasis_generate_models.R"

SnakeMake From line 17 of rules/metastasis_signatures.smk

script:
    "../scripts/metastasis_generate_ebayes_model.R"

SnakeMake From line 37 of rules/metastasis_signatures.smk

script:
    "../scripts/metastasis_generate_ebayes_model.R"

SnakeMake From line 57 of rules/metastasis_signatures.smk

script:
    "../scripts/metastasis_geneset_from_ebayes.R"

SnakeMake From line 76 of rules/metastasis_signatures.smk

script:
    "../scripts/metastasis_geneset_from_ebayes.R"

SnakeMake From line 95 of rules/metastasis_signatures.smk

script:
    "../scripts/prism_generate_annotation.R"

SnakeMake From line 18 of rules/prism_signatures.smk

script:
    "../scripts/prism_generate_ebayes_model.R"

SnakeMake From line 36 of rules/prism_signatures.smk

script:
    "../scripts/prism_signature_from_ebayes.R"

SnakeMake From line 59 of rules/prism_signatures.smk

script:
    "../scripts/prism_signature_from_ebayes.R"

SnakeMake From line 79 of rules/prism_signatures.smk

import pandas as pd


def main():

    ###  SNAKEMAKE I/O ###
    raw_celligner = snakemake.input["celligner_lines_data"]
    lines_info = snakemake.input["sample_info"]
    where_to_save = snakemake.output["cell_lines_annotation"]

    raw_celligner = pd.read_csv(
        raw_celligner,
        sep=",",
        usecols=["sampleID", "sampleID_CCLE_Name", "undifferentiated_cluster"],
    )

    ## Get rid of non ccle data
    is_line = raw_celligner["sampleID"].str.startswith("ACH-")

    undifferentiated_lines = raw_celligner.loc[
        (is_line & raw_celligner["undifferentiated_cluster"]), "sampleID"
    ]

    del raw_celligner

    ## Annotate lines data with data from the undifferentiated clusters
    lines_info = pd.read_csv(lines_info, sep=",")
    lines_info["is_undifferentiated"] = lines_info["DepMap_ID"].isin(
        undifferentiated_lines
    )

    # Preserve original lineage
    lines_info["original_lineage"] = lines_info["lineage"]

    # rename the lineages from the undif. lines to "undifferentiated"
    lines_info.loc[lines_info["is_undifferentiated"], "lineage"] = "undifferentiated"

    lines_info.to_csv(where_to_save, sep=",", index=False)


if __name__ == "__main__":
    main()

Python Pandas From line 1 of scripts/common_annotate_lines.py

library("tidyverse")

### SNAKEMAKE I/O ###
ctrp_db        <- snakemake@input[['ctrp_db']]
gdsc_db        <- snakemake@input[['gdsc_db']]
prism_db      <- snakemake@input[['prism_db']]
cell_line_annotation <- snakemake@input[['cell_line_annotation']]

db_save    <- snakemake@output[['full_database']]
db_save_rd <- snakemake@output[['full_database_rd']]

db_summary_save    <- snakemake@output[['drug_summary']]
db_summary_save_rd <- snakemake@output[['drug_summary_rd']]


prism <- read_csv(prism_db) %>%
    select(-cid) %>%
    mutate(project = "PRISM") %>%
    rename(drug_id = broad_id,
           cid = smiles
           )

ctrp  <- read_csv(ctrp_db) %>%
    mutate(project = "CTRP") %>%
    rename(drug_id = broad_id,
           cid = smile
           )

gdsc  <- read_csv(gdsc_db) %>%
    mutate(project = "GDSC") %>%
    rename(cid = pubchem) %>%
    mutate(drug_id = as.character(drug_id))

## Cell line annotation
cell_line_annotation <- read_csv(cell_line_annotation) %>%
    select(DepMap_ID,
           stripped_cell_line_name,
           original_lineage,
           lineage,
           lineage_subtype,
           primary_or_metastasis
           ) %>%
    rename(depmap_id = DepMap_ID,
           cell_line = stripped_cell_line_name,
           origin = original_lineage,
           origin_subtype = lineage_subtype,
           tumor_type = primary_or_metastasis
           )

## attempt to merge the data by outer join and add project 
combined_data <- prism %>%
    bind_rows(ctrp) %>%
    bind_rows(gdsc) %>%
    select(-lineage) %>%
    left_join(cell_line_annotation, "depmap_id") %>%
    rename(celligner_origin = lineage)


## generate summary
drug_summary <- combined_data %>%
    group_by(project, drug_id) %>%
    summarise(name = first(name),
              target = first(target),
              moa = first(moa),
              lines_tested = first(profiled_lines),
              min_auc = min(auc), 
              max_auc = max(auc),
              mean_auc = mean(auc),
              cid = cid
              ) %>%
    distinct()


write_csv(combined_data, file = db_save)
write_csv(drug_summary, file = db_summary_save)

save(combined_data, file = db_save_rd)    
save(drug_summary, file = db_summary_save_rd)    

R tidyverse From line 1 of scripts/common_build_drug_db.R

library("tidyverse")

### SNAKEMAKE I/O ###
dose_response_curves           <- snakemake@input[["curves_data"]]
compound_meta                  <- snakemake@input[["compount_meta"]]
experiment_meta                <- snakemake@input[["experiment_meta"]]
cell_meta                      <- snakemake@input[["cell_meta"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
count_matrix                   <- snakemake@input[["rna_count_matrix"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## Load tables
drug_data  <- data.table::fread(dose_response_curves) %>%
    as.data.frame()

drug_meta <- data.table::fread(compound_meta) %>%
    as.data.frame()

experiment_meta <- data.table::fread(experiment_meta) %>%
    as.data.frame()

# This table and the below one seem redundant, but CTRP data might be outdated.
# We are loading it ONLY to translate experiment_ids to cell line IDs to match
# to ccle_info
cell_meta <- data.table::fread(cell_meta) %>%
    as.data.frame()

ccle_info <- data.table::fread(cell_lines_annotation) %>%
    as.data.frame() ## I don"t like tibbles


## Load count matrix
ccle_counts <- readRDS(count_matrix)

## Add broad_cpd_id to compounds id
drug_data_annotated <- merge(x=drug_data,
                             y=drug_meta[,c("master_cpd_id",  "broad_cpd_id")],
                             by.x="master_cpd_id",
                             by.y = "master_cpd_id",
                             all.x=TRUE,
                             all.y=FALSE)


## create a named vector of master ccle ids
master_ids <- experiment_meta[!duplicated(experiment_meta$experiment_id), c("experiment_id", "master_ccl_id")] 

drug_data_annotated$master_ccl_id <- master_ids[drug_data_annotated$experiment_id, "master_ccl_id"]

## Use master_ccl_id to  annotated drug_data with ACH depmap ids
drug_data_annotated$human_ccl_name <- cell_meta[drug_data_annotated$master_ccl_id, "ccl_name"]

# get rid of experiments without ccl_name. These are collaborators lines and
# thus no RNA-Seq data is available.
drug_data_annotated_lines <- drug_data_annotated[!is.na(drug_data_annotated$human_ccl_name),]

# now get the ACH ids for the remaining lines
annotations_fields_of_interest <- c("DepMap_ID", "stripped_cell_line_name", "lineage", "is_undifferentiated")

drug_data_annotated_lines_depmap <- merge(x=drug_data_annotated_lines,
                                          y=ccle_info[,annotations_fields_of_interest],
                                          by.x="human_ccl_name",
                                          by.y="stripped_cell_line_name",
                                          all.x=TRUE)


## Get rid of lines withou ACH-id If somehow there is any missing left
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!is.na(drug_data_annotated_lines_depmap$DepMap_ID),]

## Get rid of compounds without area_under_curve
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!is.na(drug_data_annotated_lines_depmap$area_under_curve),]

## Get rid of cell lines and curves for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), drug_data_annotated_lines_depmap$DepMap_ID)
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[drug_data_annotated_lines_depmap$DepMap_ID %in% available_lines,]

# If there are duplicate cell lines by compound, keep the one with the lowest auc
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[order(drug_data_annotated_lines_depmap$area_under_curve),]
drug_line_duplicated             <- duplicated(drug_data_annotated_lines_depmap[,c("broad_cpd_id", "DepMap_ID")])
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap[!drug_line_duplicated,]

## Get the nº. lines/compound
lines_and_compounds <- drug_data_annotated_lines_depmap %>%
    group_by(broad_cpd_id) %>%
    mutate(area_under_curve = as.numeric(area_under_curve)) %>%
    summarise(
        profiled_lines = n_distinct(DepMap_ID),
        cv = sd(area_under_curve) / mean(area_under_curve)
        ) %>%
    as.data.frame()

# Save this table. It is definitely useful
write.csv(lines_and_compounds, file = compounds_lines_profiled, row.names=FALSE)

## Keep compounds with at least 10 profiled lines
compounds_to_test <- lines_and_compounds %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(broad_cpd_id, .keep_all = TRUE) %>%
    pull(broad_cpd_id)

## set lineage if undiff
drug_data_annotated_lines_depmap <- drug_data_annotated_lines_depmap %>% 
    filter(broad_cpd_id %in% compounds_to_test)

drug_data_annotated_lines_depmap[drug_data_annotated_lines_depmap$is_undifferentiated, "lineage"] <- "undifferentiated"

drug_data_annotated_lines_depmap$lineage          <- as.factor(drug_data_annotated_lines_depmap$lineage)
drug_data_annotated_lines_depmap$area_under_curve <- as.numeric(drug_data_annotated_lines_depmap$area_under_curve)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(drug_data_annotated_lines_depmap,
   drug_data_annotated_lines_depmap$broad_cpd_id,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$broad_cpd_id[1], ".csv"), , row.names=FALSE))

R tidyverse From line 1 of scripts/ctrp_generate_curves.R

library("tidyverse")


### SNAKEMAKE I/O ###
compound_data        <- snakemake@input[['compound_data']]
compound_meta        <- snakemake@input[['compound_meta']]
lines_compounds      <- snakemake@input[['lines_compounds']]

comma_file <- snakemake@output[['csv_db']]
rdata      <- snakemake@output[['rdata_db']]

full_table <- compound_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## Now keep columns of interest
# broad_id, name, auc, ic50, depmap_id, lineage, moa
filtered_data <- full_table %>%
    select(master_cpd_id,
           broad_cpd_id,
           area_under_curve,
           apparent_ec50_umol,
           DepMap_ID,
           lineage
           ) %>%
    left_join(y = lines_compounds, by = "broad_cpd_id") %>%
    rename(broad_id = broad_cpd_id,
           auc = area_under_curve,
           ec50 = apparent_ec50_umol,
           depmap_id = DepMap_ID)


## We need to annotate the moa for CRTP
compound_meta <- read_tsv(compound_meta) %>%
    select(master_cpd_id,
           cpd_name,
           gene_symbol_of_protein_target,
           target_or_activity_of_compound,
           cpd_smiles,
           ) %>%
    rename(
        name = cpd_name,
        target = gene_symbol_of_protein_target,
        moa = target_or_activity_of_compound,
        smile = cpd_smiles
        )

filtered_annotated_data <- filtered_data %>%
    left_join(compound_meta, by = "master_cpd_id") %>%
    select(-master_cpd_id)


write_csv(x = filtered_annotated_data, file = comma_file)
save(filtered_annotated_data, file = rdata)    

R tidyverse From line 1 of scripts/ctrp_generate_drug_db.R

library('edgeR')
library('limma')

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[['compound_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

# This might be the only key difference between this script and Dep/PRISM generation
compound_to_test$area_under_curve            <- as.numeric(compound_to_test$area_under_curve)
compound_to_test$lineage                     <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$DepMap_ID
count_matrix  <- ccle_counts[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$DepMap_ID

## voom model
design <- model.matrix(~lineage + area_under_curve, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method='quantile')

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/ctrp_generate_ebayes_model.R

library('GSEABase')
library('limma')
library('dplyr')


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[['fitted_bayes']]
drug_info          <- snakemake@input[['treatment_info']]

geneset_directory <- snakemake@output[['bidirectional_geneset']]

compound_id       <- snakemake@wildcards[['broad_id']]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[['signature_type']]))

## If signature_type == 'Classic',
## then we'll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == 'fold', we'll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if(signature_type=='classic'){
        sensitivity_genes <- deg_genes[deg_genes$t <0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$t >0, 'ID']
    }else{
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, 'ID']
    }
    if(length(sensitivity_genes) >= 15){
       candidate_genes <- extract_top_genes(deg_genes, 'sensitivity', signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_UP', '.gmt', sep=''))
    }
    if(length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, 'resistance', signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_DOWN', '.gmt', sep='')) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode='sensitivity', signature_type='classic'){

    ## Extract and sort the top hits for the selected direction/mode
    if(mode=='sensitivity'){

        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(deg_genes$t), 'ID'], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), 'ID'], n=250)
        }
    }else{
        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(-deg_genes$t), 'ID'], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), 'ID'], n=250)
        }
    }
    return(deg_genes)
}


drug_info <- data.table::fread(drug_info) %>%
    as.data.frame()

drug_info <- drug_info[!duplicated(drug_info$broad_cpd_id),]

bayes_results <- readRDS(eBayes_model)

if(signature_type == 'classic'){
    all_genes <- topTable(bayes_results, coef='area_under_curve', number = Inf, adjust.method='fdr')
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = 'area_under_curve', 
                     number = Inf, adjust.method = 'fdr',
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we'll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$broad_cpd_id == compound_id, 'cpd_name']

    if(length(common_name) == 0){
        common_name <- compound_id}

    brd_split   <- strsplit(x=compound_id, split='-', fixed=TRUE)[[1]][2]

    # This code deals with combinations of drugs, to keep names short
    if(grepl(pattern = "mol/mol", x = common_name, fixed = TRUE)){
     common_name <- strsplit(x = common_name, split = " ")[[1]][1]}

    sig_name <- paste(sep='_', common_name,'CTRP', brd_split)

    generate_bidirectional_signature(sig_name, all_genes)

}

R dplyr limma GSEABase From line 1 of scripts/ctrp_signature_from_ebayes.R

library("tidyverse")

###  SNAKEMAKE I/O ###
raw_crispr_data <- snakemake@input[["crispr_gene_dependency_chronos"]]
lines_info <- snakemake@input[["sample_info"]]
ccle_raw_reads <- snakemake@input[["expression_matrix"]]
where_to_save <- snakemake@output[["model_candidates"]]

ccle_counts <- readRDS(ccle_raw_reads)
crispr <- data.table::fread(raw_crispr_data)
annot <-  data.table::fread(lines_info)

## keep lines whose primary disease is cancer AND with available expr.
annot_cancer <- annot %>%
    filter(
        !primary_disease %in% c("Unknown", "Non-Cancerous") 
    )

## drop lineages with too few CCLs
all_lineages <- table(annot_cancer$lineage)
lineages_to_keep <- names(all_lineages[all_lineages >= 5])

## get the names of the CCLs whose lineages are OK AND with RNA-seq
lines_to_keep <- annot_cancer %>%
    filter(
        lineage %in% lineages_to_keep
    ) %>%
    pull(DepMap_ID)

lines_to_keep <- intersect(lines_to_keep, colnames(ccle_counts))

## Filter crispr data
crispr_filtered <- crispr %>%
    filter(
        ModelID %in% lines_to_keep
    )

crispr_lines <- crispr_filtered$ModelID
crispr_filtered$ModelID <- NULL

# CCLE nomenclature is HUGO (ENTREZ). Keep HUGO ONLY
colnames(crispr_filtered) <- stringr::str_remove_all(string = colnames(crispr_filtered),
                                                     pattern = " \\(.*$"
                                                     )

## Resolve duplicates keeping most variable gene
crispr_filtered <- t(crispr_filtered)
colnames(crispr_filtered) <- crispr_lines

vars <- apply(crispr_filtered, 1, var)
crispr_filtered <- cbind(crispr_filtered, vars)
crispr_filtered <- crispr_filtered[order(crispr_filtered[, "vars"],decreasing=TRUE),]

crispr_filtered_unique <- crispr_filtered[!duplicated(rownames(crispr_filtered)), ]
crispr_filtered_unique <- crispr_filtered_unique[, colnames(crispr_filtered_unique) != "vars"]

## Melt the dataset
crispr_probabilities <- crispr_filtered_unique %>%
    as_tibble(rownames = "Gene") %>%
    pivot_longer(
        cols = starts_with("ACH-"),
        names_to = "cell_line",
        values_to = "probability"
        )


# Filter out genes with less than 5 cell lines dependant on them
crispr_enough_power <- crispr_probabilities %>%
    group_by(Gene) %>%
    mutate(dependent_lines = sum(probability >= 0.5)) %>%
    filter(
        dependent_lines >= 5
    )

# annotate the lineage and stripped name
crispr_enough_power_annotated <- crispr_enough_power %>%
    left_join(
        y = annot_cancer[, c("DepMap_ID",
                             "lineage",
                             "stripped_cell_line_name",
                             "original_lineage"
                             )
                         ],
        by = c("cell_line" = "DepMap_ID")
    )


if(!dir.exists(file.path(where_to_save))){
    dir.create(where_to_save)}

by(crispr_enough_power_annotated,
   crispr_enough_power_annotated$Gene,
   FUN=function(i) write.csv(i, paste0(where_to_save, "/", i$Gene[1], ".csv"), , row.names=FALSE))

R tidyverse From line 1 of scripts/dependencies_annotate_dependencies.R

library("limma")
library("fgsea")

### SNAKEMAKE I/O ###
ebayes_model    <- snakemake@input[["ebayes_model"]]
geneset_collection <- snakemake@input[["drugs"]]
enrichment_table  <- snakemake@output[["enrichments"]]

## Load and set data types manually just in case
ebayes_model <- readRDS(ebayes_model)
geneset_collection <- fgsea::gmtPathways(geneset_collection)

## Subset the collection to keep UP drug genesets only
is_up_geneset <- grepl(pattern = "\\.*_UP", x = names(geneset_collection))
geneset_collection <- geneset_collection[is_up_geneset]

## generate a named vector of gene-level stats
## An actual ranking is not needed
t_rank <- topTable(
    fit = ebayes_model,
    coef = "probability",
    number = Inf,
    sort.by = "t"
    )

named_gene_rank <- t_rank$t
names(named_gene_rank) <- rownames(t_rank)

fgsea_res <- fgsea(
    pathways = geneset_collection,
    stats = named_gene_rank,
    minSize = 15,
    maxSize = 500,
    gseaParam = 1,
    scoreType = "pos"
)

fgsea_res$leadingEdge <- sapply(
    fgsea_res$leadingEdge,
    FUN = paste,
    collapse = " "
    )

new_directory <- dirname(enrichment_table)

if (!dir.exists(file.path(new_directory))) {
    dir.create(new_directory)
    }

write.table(
    x = fgsea_res,
    file = enrichment_table,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
)

R limma fgsea From line 1 of scripts/dependencies_drug_enrichment_from_ebayes.R

library("limma")
library("fgsea")

### SNAKEMAKE I/O ###
ebayes_model    <- snakemake@input[["ebayes_model"]]
geneset_collection <- snakemake@input[["hallmarks"]]
enrichment_table  <- snakemake@output[["enrichments"]]

## Load and set data types manually just in case
ebayes_model <- readRDS(ebayes_model)
geneset_collection <- fgsea::gmtPathways(geneset_collection)

## generate a named vector of gene-level stats
## An actual ranking is not needed
t_rank <- topTable(
    fit = ebayes_model,
    coef = "probability",
    number = Inf,
    sort.by = "t"
    )

named_gene_rank <- t_rank$t
names(named_gene_rank) <- rownames(t_rank)

fgsea_res <- fgsea(
    pathways = geneset_collection,
    stats = named_gene_rank,
    minSize = 15,
    maxSize = 500,
    gseaParam = 1,
    scoreType = "pos"
)

fgsea_res$leadingEdge <- sapply(
    fgsea_res$leadingEdge,
    FUN = paste,
    collapse = " "
    )

new_directory <- dirname(enrichment_table)

if (!dir.exists(file.path(new_directory))) {
    dir.create(new_directory)
    }

write.table(
    x = fgsea_res,
    file = enrichment_table,
    sep = "\t",
    row.names = FALSE,
    col.names = TRUE
)

R limma fgsea From line 1 of scripts/dependencies_enrichment_from_ebayes.R

library('edgeR')
library('limma')

### SNAKEMAKE I/O ###
dep_to_test           <- snakemake@input[['dependency_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
dep_to_test           <- read.csv(dep_to_test)

dep_to_test$probability         <- as.numeric(dep_to_test$probability)
dep_to_test$lineage             <- as.factor(dep_to_test$lineage)

## Subset the counts
lines_to_test <- dep_to_test$cell_line
count_matrix  <- ccle_counts[,lines_to_test]

rownames(dep_to_test) <- dep_to_test$cell_line

## voom model
design <- model.matrix(~lineage + probability, data=dep_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design, min.total.count = 100, min.count = 50)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge, method = "TMM")

## TODO: save voom model plot
v <- voom(dge, design = design, normalize.method = "quantile", plot=FALSE)

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/dependencies_generate_ebayes_model.R

suppressMessages(library('GSEABase'))
suppressMessages(library('limma'))
suppressMessages(library('dplyr'))


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[['fitted_bayes']]
geneset_directory  <- snakemake@output[['bidirectional_geneset']]

gene_name          <- snakemake@wildcards[['gene']]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")



generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between dep_associated_genes (positive logfold) 
    ## and non.dep associated genes (negative
    dependency_upregulated         <- deg_genes[deg_genes$logFC > 0, 'ID']
    dependency_downregulated       <- deg_genes[deg_genes$logFC < 0, 'ID']

    if(length(dependency_upregulated) >= 5){

       candidate_genes  <- extract_top_genes(deg_genes, 'dependent_up')
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

        GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_dependency_UP', '.gmt', sep=''))
    }
    if(length(dependency_downregulated) >= 5){
        candidate_genes <- extract_top_genes(deg_genes, 'dependent_down')
        resistance_set  <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_dependency_DOWN', '.gmt', sep='')) 
    }
}

extract_top_genes <- function(deg_genes, mode='dependent_up'){

    if(mode=='dependent_up'){
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = TRUE), 'ID'], n=250)
    }else{
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = FALSE), 'ID'], n=250)
    }

    return(deg_genes)
}


bayes_results <- readRDS(eBayes_model)

all_genes <- topTable(bayes_results, coef = 'probability', number = Inf, adjust.method = 'fdr')

all_genes <- all_genes[all_genes$adj.P.Val <= 0.05, ]

all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}


if(nrow(all_genes) >= 5){

    sig_name <- paste(sep='_', gene_name, 'DepMap', '22Q2')
    generate_bidirectional_signature(sig_name, all_genes)
}

R dplyr limma GSEABase From line 1 of scripts/dependency_signature_from_ebayes.R

library('ArrayExpress')

### SNAKEMAKE I/O ###
raw_cel_path      <- snakemake@output[['raw_cel_files']]

if(!dir.exists(file.path(raw_cel_path))){
    dir.create(raw_cel_path)}

# E-MTAB-3610 is the ArrayExpress ID of the GDSC-1000 profiling array
array_set = ArrayExpress::getAE('E-MTAB-3610', path=raw_cel_path, type='raw')

R ArrayExpress From line 1 of scripts/gdsc_download_raw_files.R

library("tidyverse")

### SNAKEMAKE I/O ###
gdsc_dose_response_curves      <- snakemake@input[["dose_response_curves"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
array_metadata                 <- snakemake@input[["array_metadata"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load tables

candidate_curves <- readxl::read_xlsx(gdsc_dose_response_curves) %>%
                    filter(RMSE <= 0.3) %>%
                    as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                        as.data.frame()

array_metadata <- data.table::fread(array_metadata) %>%
                  as.data.frame()

## Filter out drug/lines associations where the line is not profiled
## use the same column as gdsc_normalize_arrays
available_lines    <- array_metadata[, "Characteristics[cell line]"]
rm(array_metadata)

## Match SANGER and CCLE info
## I Manually tested it and only 1 line cannot be matched
sanger_lines <- unique(candidate_curves$SANGER_MODEL_ID)
ccle_lines   <- unique(cell_lines_annotation$Sanger_Model_ID)

common_lines <- intersect(sanger_lines, ccle_lines)

## Keep curves present in both the array AND the CCLE metadata
candidate_curves <- filter(candidate_curves, SANGER_MODEL_ID %in% common_lines, CELL_LINE_NAME %in% available_lines)

## Annotate cell line histology
candidate_curves <- merge(x=candidate_curves,
                         y=cell_lines_annotation[,c("Sanger_Model_ID", "lineage")],
                         by.x="SANGER_MODEL_ID",
                         by.y="Sanger_Model_ID",
                         all.x=TRUE,
                         all.y=FALSE)

## get the number of profiled lines/compound
lines_by_compound <- candidate_curves %>%
                     group_by(DRUG_ID) %>%
                     summarise(
                        profiled_lines = n_distinct(SANGER_MODEL_ID),
                        cv  = sd(AUC) / mean(AUC)
                        ) %>%
                     as.data.frame()

##TODO: SAVE THIS
write.csv(lines_by_compound, file=compounds_lines_profiled, row.names=FALSE)

## Get rid of models with less than 10 profiled lines
compounds_to_test <- lines_by_compound %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(DRUG_ID, .keep_all = TRUE) %>%
    pull(DRUG_ID)

candidate_curves   <- filter(candidate_curves, DRUG_ID %in% compounds_to_test)

candidate_curves$lineage <- as.factor(candidate_curves$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(candidate_curves,
   candidate_curves$DRUG_ID,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$DRUG_ID[1], ".csv"), , row.names=FALSE))

R tidyverse From line 2 of scripts/gdsc_generate_curves.R

library("tidyverse")


### SNAKEMAKE I/O ###
signatures_data        <- snakemake@input[["signatures_data"]]
cell_line_annotation <- snakemake@input[["cell_line_annotation"]]
lines_compounds      <- snakemake@input[["lines_compounds"]]
compound_meta        <- snakemake@input[["compound_meta"]]

comma_file <- snakemake@output[["csv_db"]]
rdata      <- snakemake@output[["rdata_db"]]

full_table <- signatures_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## load compound metadata
compound_meta <- read_csv(compound_meta) %>%
    filter(`Sample Size` == "GDSC2") %>%
    select(drug_id, PubCHEM)

## Now keep columns of interest
# broad_id, name, auc, ic50, depmap_id, lineage, moa
filtered_data <- full_table %>%
    select(DRUG_ID,
           DRUG_NAME,
           AUC,
           LN_IC50,
           SANGER_MODEL_ID,
           lineage,
           PUTATIVE_TARGET,
           PATHWAY_NAME
           ) %>%
    left_join(y = lines_compounds, by = "DRUG_ID") %>%
    rename(drug_id = DRUG_ID,
           name = DRUG_NAME,
           auc = AUC,
           ic50 = LN_IC50,
           cell_id = SANGER_MODEL_ID,
           moa = PATHWAY_NAME,
           target = PUTATIVE_TARGET) %>%
    left_join(y = compound_meta, by = "drug_id") %>%
    rename(
        pubchem = PubCHEM
    )

## Attempt to get DepMap IDs using SANGER passport
cell_line_annotation <- read_csv(cell_line_annotation) %>%
    select(Sanger_Model_ID, DepMap_ID) %>%
    rename(cell_id = Sanger_Model_ID,
           depmap_id = DepMap_ID) %>%
    filter(!is.na(cell_id))

filtered_annotated_data <- filtered_data %>%
    left_join(cell_line_annotation, by = "cell_id") %>%
    select(-cell_id)

write_csv(x = filtered_annotated_data, file = comma_file)
save(filtered_annotated_data, file = rdata)    

R tidyverse From line 1 of scripts/gdsc_generate_drug_db.R

library('limma')

### SNAKEMAKE I/O ###
compound_to_test        <- snakemake@input[['compound_to_test']]
normalized_arrays       <- snakemake@input[['normalized_arrays']]
ebayes_model            <- snakemake@output[['ebayes']]

## Load and set data types manually just in case
normalized_arrays           <- readRDS(normalized_arrays)
compound_to_test            <- read.csv(compound_to_test)

compound_to_test$AUC            <- as.numeric(compound_to_test$AUC)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)


## Subset the arrays
lines_to_test      <- compound_to_test$CELL_LINE_NAME
normalized_arrays  <- normalized_arrays[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$CELL_LINE_NAME

design <- model.matrix(~lineage + AUC, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
normalized_arrays <- normalized_arrays[,rownames(design)]

fit <- lmFit(normalized_arrays, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/gdsc_generate_ebayes_model.R

library('oligo')
library('oligoClasses')

### SNAKEMAKE I/O ###
raw_cel_path      <- snakemake@input[['cel_files']]
normalized_arrays <- snakemake@output[['normalized_arrays']]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

sdrf_location <- file.path(raw_cel_path, "E-MTAB-3610.sdrf.txt")
SDRF          <- read.delim(sdrf_location)


rownames(SDRF) <- SDRF$Array.Data.File
SDRF           <- AnnotatedDataFrame(SDRF)

raw_exprset <- oligo::read.celfiles(filenames = file.path(raw_cel_path, 
                                    SDRF$Array.Data.File),
                                    verbose = FALSE, phenoData = SDRF)


norm_eset <- oligo::rma(raw_exprset, background = TRUE, normalize = TRUE)

## Save it as a matrix directly, no need to keep the whole object.
## Make sure to change colnames 
pdata <- phenoData(norm_eset) 
pdata <- as.data.frame(pdata@data)

eset            <- exprs(norm_eset)
colnames(eset)  <- pdata[colnames(eset), 'Characteristics.cell.line.']

saveRDS(eset, normalized_arrays)

R oligo oligoClasses From line 1 of scripts/gdsc_normalize_arrays.R

library("tidyverse")

### SNAKEMAKE I/O ###
gdsc_dose_response_curves      <- snakemake@input[["dose_response_curves"]]
cell_lines_annotation          <- snakemake@input[["cell_lines_annotation"]]
count_matrix                   <- snakemake@input[["count_matrix"]]

auc_models_candidates          <- snakemake@output[["auc_models_candidates"]]
compounds_lines_profiled       <- snakemake@output[["compounds_lines_profiled"]]

# Setup log file
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load tables

candidate_curves <- readxl::read_xlsx(gdsc_dose_response_curves) %>%
                    filter(RMSE <= 0.3) %>%
                    as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                        as.data.frame()

ccle_counts <- readRDS(count_matrix)

## Filter out drug/lines associations where the line is not profiled
available_lines    <- colnames(ccle_counts)
rm(ccle_counts)

## common lines check
common_lines <- intersect(
    candidate_curves$SANGER_MODEL_ID, 
    cell_lines_annotation$Sanger_Model_ID
    )

## Filter the curves based on expression availability and annotation
candidate_curves_filtered <- candidate_curves %>%
    filter(
        SANGER_MODEL_ID %in% common_lines
    ) %>%
    left_join(
        y = cell_lines_annotation[, c("Sanger_Model_ID", "DepMap_ID", "lineage")],
        by = c("SANGER_MODEL_ID" = "Sanger_Model_ID")
    ) %>%
    filter(
        DepMap_ID %in% available_lines
    )

## get the number of profiled lines/compound
lines_by_compound <- candidate_curves_filtered %>%
    group_by(DRUG_ID) %>%
    summarise(
        profiled_lines = n_distinct(SANGER_MODEL_ID),
        cv  = sd(AUC) / mean(AUC)
    ) %>%
    as.data.frame()

write.csv(lines_by_compound, file=compounds_lines_profiled, row.names=FALSE)

## Get rid of models with less than 10 profiled lines
compounds_to_test <- lines_by_compound %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>% 
    distinct(DRUG_ID, .keep_all = TRUE) %>%
    pull(DRUG_ID)

candidate_curves_filtered <- filter(candidate_curves_filtered,
                                    DRUG_ID %in% compounds_to_test
                                    )

candidate_curves_filtered$lineage <- as.factor(candidate_curves_filtered$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(candidate_curves_filtered,
   candidate_curves_filtered$DRUG_ID,
   FUN=function(i) write.csv(i, 
                             paste0(auc_models_candidates, 
                                    "/", i$DRUG_ID[1], ".csv"), ,
                             row.names=FALSE)
   )

R tidyverse From line 2 of scripts/gdsc_rna_generate_curves.R

suppressMessages(library("edgeR"))
suppressMessages(library("limma"))

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[["compound_to_test"]]
raw_gene_counts       <- snakemake@input[["raw_gene_counts"]]

ebayes_model          <- snakemake@output[["ebayes"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

compound_to_test$AUC            <- as.numeric(compound_to_test$AUC)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$DepMap_ID
count_matrix  <- ccle_counts[, lines_to_test]

rownames(compound_to_test) <- compound_to_test$DepMap_ID

## voom model
design <- model.matrix(~lineage + AUC, data = compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[, rownames(design)]

dge  <- DGEList(counts = count_matrix)
keep <- filterByExpr(dge, design = design)

dge <- dge[keep, keep.lib.sizes = FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot = FALSE, normalize.method = "quantile")

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/gdsc_rna_generate_ebayes_model.R

suppressMessages(library("openxlsx"))
suppressMessages(library("GSEABase"))
suppressMessages(library("limma"))
suppressMessages(library("tidyverse"))

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## SNAKEMAKE I/O  ##
eBayes_model       <- snakemake@input[["fitted_bayes"]]
drug_info          <- snakemake@input[["treatment_info"]]
geneset_directory <- snakemake@output[["bidirectional_geneset"]]
compound_id       <- snakemake@wildcards[["drug_id"]]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[["signature_type"]]))

# Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## If signature_type == "Classic",
## then we"ll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == "fold", we"ll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if (signature_type == "classic") {
        sensitivity_genes <- deg_genes[deg_genes$t < 0, "ID"]
        resistance_genes  <- deg_genes[deg_genes$t > 0, "ID"]
    }else {
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, "ID"]
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, "ID"]
    }
    if (length(sensitivity_genes) >= 15) {
       candidate_genes <- extract_top_genes(deg_genes, "sensitivity", signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, "UP", sep = "_")

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, "/", sig_name, "_UP", ".gmt", sep=""))
    }
    if (length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, "resistance", signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, "DOWN", sep = "_")

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, "/", sig_name, "_DOWN", ".gmt", sep="")) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode = "sensitivity", signature_type = "classic"){

    ## Extract and sort the top hits for the selected direction/mode
    if (mode == "sensitivity"){

        if(signature_type == "classic"){
            deg_genes <- head(deg_genes[order(deg_genes$t), "ID"], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), "ID"], n=250)
        }
    }else{
        if(signature_type=="classic"){
            deg_genes <- head(deg_genes[order(-deg_genes$t), "ID"], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), "ID"], n=250)
        }
    }
    return(deg_genes)
}


# Get treatment annotation
drug_info <- read.xlsx(drug_info)
drug_info <- drug_info %>% dplyr::select(DRUG_ID, DRUG_NAME) %>% unique

bayes_results <- readRDS(eBayes_model)

if(signature_type == "classic"){
    all_genes <- topTable(bayes_results, coef="AUC", number = Inf, adjust.method="fdr")
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = "AUC", 
                     number = Inf, adjust.method = "fdr",
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we"ll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$DRUG_ID == compound_id, "DRUG_NAME"]

    if (length(common_name) == 0){
        common_name <- compound_id
    }

    sig_name <- paste(sep="_", common_name, "GDSC", compound_id)

    generate_bidirectional_signature(sig_name, all_genes)

}

R tidyverse limma openxlsx GSEABase From line 1 of scripts/gdsc_rna_signature_from_ebayes.R

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressMessages(library("limma"))
suppressMessages(library("openxlsx"))
suppressMessages(library("tidyverse"))
suppressMessages(library("GSEABase"))

## SNAKEMAKE I/O ##
eBayes_model <- snakemake@input[["fitted_bayes"]]
drug_info <- snakemake@input[["treatment_info"]]
hgnc <- snakemake@input[["hgnc"]]

geneset_directory <- snakemake@output[["bidirectional_geneset"]]

## SNAKEMAKE PARAMS ##
signature_type <- tolower(as.character(snakemake@params[['signature_type']]))

## FUNCTION ##
create.gmt <- function(x, mode) {
  gset <- GeneSet(x, geneIdType = SymbolIdentifier())
  setName(gset) <- paste(sig_name, mode, sep = "_")
  collapsed_name <- str_replace_all(sig_name, pattern = " ", repl = "")
  toGmt(gset, con = paste0(geneset_directory, '/', collapsed_name, "_", mode, 
                           ".gmt"))
}

## CODE ##
# Create output directory
dir.create(geneset_directory, showWarnings = FALSE)

# Get Bayes result
bayes_results <- readRDS(eBayes_model)

# Get treatment annotation
drug_info <- read.xlsx(drug_info)
drug_info <- drug_info %>% dplyr::select(DRUG_ID, DRUG_NAME) %>% unique

# Get signature name
sigID <- gsub("_eBayes.rds", "", basename(eBayes_model))
common_name <- drug_info$DRUG_NAME[match(sigID, drug_info$DRUG_ID, 
                                         nomatch = sigID)]
sig_name <- paste(common_name, "GDSC", sigID, sep = "_")

# Get probe ID and HUGO symbol correspondence
hgnc <- read.table(hgnc, sep = "\t", header = TRUE)

# Variables depending on signature_type
if (signature_type == "classic") {
  p.val = 1
  magnitude <- "t"
} else if (signature_type == "fold") {
  p.val = 0.05
  magnitude <- "logFC"
}

# Get top genes table
top_genes <- topTable(bayes_results, coef = "AUC", number = Inf, 
                      adjust.method = "fdr", p.value = p.val)

# Continue if the table is not empty
if (nrow(top_genes) != 0) {
  # Make the rownames a new column
  top_genes$probe <- rownames(top_genes)

  # Match with HUGO symbols
  top_genes <- na.omit(merge(top_genes, hgnc[, c("probe", "hgnc")], all.x = TRUE))

  # If there are several values for the same gene, keep the one with the highest 
  # absolute value of "magnitude"
  top_genes <- top_genes %>% group_by(hgnc) %>% 
    dplyr::slice(which.max(get(magnitude)))

  # Create bidirectional signature
  if (signature_type == "classic") {
    sensitivity <- top_genes %>% filter(t < 0)
    resistance <- top_genes %>% filter(t > 0)
  } else if (signature_type == "fold") {
    sensitivity <- top_genes %>% filter(logFC < 0)
    resistance <- top_genes %>% filter(logFC > 0)
  }

  # Keep the 250 top/bottom genes
  sensitivity <- head(sensitivity, n = 250) %>% arrange(desc(get(magnitude))) %>% 
    dplyr::select(hgnc) %>% pull
  resistance <- tail(resistance, n = 250) %>% arrange(get(magnitude)) %>%
    dplyr::select(hgnc) %>% pull

  # Ugly fix to keep fold signatures at 250. We really should refactor these
  if (signature_type == "fold"){
    sensitivity <- head(sensitivity, n = 250)
    resistance <- head(resistance, n = 250)
  }

  # Create a gmt if the number of genes > 15
  if (length(sensitivity) >= 15) {
    create.gmt(sensitivity, "UP")
  }
  if (length(resistance) >= 15) {
    create.gmt(resistance, "DOWN")
  }
}

R tidyverse limma openxlsx GSEABase From line 1 of scripts/gdsc_signature_from_ebayes.R

suppressMessages(library("edgeR"))
suppressMessages(library("limma"))

### SNAKEMAKE I/O ###
met_type_to_test      <- snakemake@input[["mets_to_test"]]
raw_gene_counts       <- snakemake@input[["raw_gene_counts"]]

ebayes_model          <- snakemake@output[["ebayes"]]

### SNAKEMAKE PARAMS ###
model_type <- snakemake@params[["model_type"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
met_type_to_test      <- read.csv(met_type_to_test)

## Sanity check
met_type_to_test$mean            <- as.numeric(met_type_to_test$mean)
met_type_to_test$penetrance      <- as.numeric(met_type_to_test$penetrance)
met_type_to_test$lineage        <- as.factor(met_type_to_test$lineage)

## Subset the counts
lines_to_test <- met_type_to_test$DepMap_ID
count_matrix  <- ccle_counts[, lines_to_test]

rownames(met_type_to_test) <- met_type_to_test$DepMap_ID

## voom model
if(model_type == "mean"){
    design <- model.matrix(~lineage + mean, data=met_type_to_test)
}else{
    design <- model.matrix(~lineage + penetrance, data=met_type_to_test)
}

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[ ,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method="quantile")

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/metastasis_generate_ebayes_model.R

library("tidyverse")


### SNAKEMAKE I/O ###
metastasis_penetrance        <- snakemake@input[["metastasis_data"]]
cell_lines_annotation  <- snakemake@input[["cell_lines_annotation"]]
count_matrix           <- snakemake@input[["count_matrix"]]

met_model_candidates   <- snakemake@output[["met_model_candidates"]]
met_lines_profiled      <- snakemake@output[["met_lines_profiled"]]

## load tables
metastasis_penetrance <- readr:::read_csv(metastasis_penetrance) %>%
                  as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                         as.data.frame()

ccle_counts <- readRDS(count_matrix)

metastasis_penetrance <- left_join(
    x = metastasis_penetrance,
    y = cell_lines_annotation,
    by = c("cell_line" = "CCLE_Name")
)  

## Get rid of cell lines and met data for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), metastasis_penetrance$DepMap_ID)

selected_models <- metastasis_penetrance %>%
    filter(DepMap_ID %in% available_lines) %>%
    select(
        met_model,
        DepMap_ID,
        mean, 
        penetrance,
        cell_line,
        lineage,
        original_lineage,
        primary_or_metastasis
        ) %>%
    mutate(
        lineage = as_factor(lineage),
        met_model = as_factor(met_model)
    )

## Get the nº. lines/compound
lines_and_mets <- selected_models %>%
    group_by(met_model) %>%
    summarise(profiled_lines = n_distinct(DepMap_ID)) %>%
    as.data.frame()

write.csv(lines_and_mets, file = met_lines_profiled, row.names=FALSE)


if(!dir.exists(file.path(met_model_candidates))){
    dir.create(met_model_candidates)}

by(selected_models,
   selected_models$met_model,
   FUN=function(i) write.csv(i, paste0(met_model_candidates, '/', i$met_model[1], ".csv"), , row.names=FALSE))

R tidyverse From line 1 of scripts/metastasis_generate_models.R

suppressMessages(library("GSEABase"))
suppressMessages(library("limma"))
suppressMessages(library("dplyr"))


## SNAKEMAKE I/O  ##

eBayes_model       <- snakemake@input[["fitted_bayes"]]
geneset_directory  <- snakemake@output[["bidirectional_geneset"]]

met_name          <- snakemake@wildcards[["met_type"]]
model_type        <- snakemake@params[["model_type"]]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")



generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between dep_associated_genes (positive logfold) 
    ## and non.dep associated genes (negative
    dependency_upregulated         <- deg_genes[deg_genes$logFC > 0, "ID"]
    dependency_downregulated       <- deg_genes[deg_genes$logFC < 0, "ID"]

    if(length(dependency_upregulated) >= 5){

       candidate_genes  <- extract_top_genes(deg_genes, "met_UP")
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, "UP", sep="_")

        GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, "/", sig_name, "_", model_type, "_met_UP", ".gmt", sep=""))
    }
    if(length(dependency_downregulated) >= 5){
        candidate_genes <- extract_top_genes(deg_genes, "met_DOWN")
        resistance_set  <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, "DOWN", sep="_")

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, "/", sig_name, "_", model_type, "_met_DOWN", ".gmt", sep="")) 
    }
}

extract_top_genes <- function(deg_genes, mode="met_UP"){

    if(mode=="met_UP"){
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = TRUE), "ID"], n=250)
    }else{
        deg_genes <- head(deg_genes[order(deg_genes$logFC, decreasing = FALSE), "ID"], n=250)
    }

    return(deg_genes)
}


bayes_results <- readRDS(eBayes_model)

all_genes <- topTreat(bayes_results, coef = model_type, 
                     number = Inf, adjust.method = "fdr")

all_genes <- all_genes[all_genes$adj.P.Val <= 0.05, ]

all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}


if(nrow(all_genes) >= 5){

    sig_name <- paste(sep="_", met_name, model_type, "MetMap", "2022")
    generate_bidirectional_signature(sig_name, all_genes)
}

R dplyr limma GSEABase From line 1 of scripts/metastasis_geneset_from_ebayes.R

library("tidyverse")


### SNAKEMAKE I/O ###
response_curves        <- snakemake@input[["response_curves"]]
cell_lines_annotation  <- snakemake@input[["cell_lines_annotation"]]
count_matrix           <- snakemake@input[["count_matrix"]]

compounds_lines_profiled   <- snakemake@output[["compounds_lines_profiled"]]
auc_models_candidates      <- snakemake@output[["auc_models_candidates"]]

## load tables
response_curves <- data.table::fread(response_curves) %>%
                  as.data.frame()

cell_lines_annotation <- data.table::fread(cell_lines_annotation) %>%
                         as.data.frame()

ccle_counts <- readRDS(count_matrix)

## Filter out LOW QC response curves
## MTS are technical redos of some assays
## Keep HTS for now, in the future maybe mix, as it seems MTS-10 is better
response_curves <- response_curves[!is.na(response_curves$auc),]

response_curves <- response_curves %>%
    filter(screen_id == "HTS002" & !is.na(auc) & r2 >= 0.7 & lower_limit <= 1)

## Get rid of cell lines and curves for which no RNASeq profiling was done
available_lines <- intersect(colnames(ccle_counts), response_curves$depmap_id)
response_curves <- response_curves[response_curves$depmap_id %in% available_lines, ]

## Get the nº. lines/compound and the CV
lines_and_compounds <- response_curves %>%
    group_by(broad_id) %>%
    summarise(
        profiled_lines = n_distinct(depmap_id),
         cv = sd(auc) / mean(auc),
    ) %>%
    as.data.frame()

write.csv(lines_and_compounds, file = compounds_lines_profiled, row.names=FALSE)

## Keep compounds with at least 10 profiled lines and CV >= 0.1
## Also, get rid of duplicated compund assays. Keep the one with the highest CV
compounds_to_test <- lines_and_compounds %>%
    filter(profiled_lines >= 10 & cv >= 0.1) %>%
    arrange(desc(cv)) %>%
    distinct(broad_id, .keep_all = TRUE) %>%
    pull(broad_id)  

response_curves   <- filter(response_curves, broad_id %in% compounds_to_test)

## Annotate cell line histology
response_curves <- merge(x=response_curves,
                         y=cell_lines_annotation[,c("DepMap_ID", "lineage")],
                         by.x="depmap_id",
                         by.y="DepMap_ID",
                         all.x=TRUE,
                         all.y=FALSE)

response_curves$lineage <- as.factor(response_curves$lineage)

if(!dir.exists(file.path(auc_models_candidates))){
    dir.create(auc_models_candidates)}

by(response_curves,
   response_curves$broad_id,
   FUN=function(i) write.csv(i, paste0(auc_models_candidates, "/", i$broad_id[1], ".csv"), , row.names=FALSE))

R tidyverse From line 1 of scripts/prism_generate_annotation.R

library("tidyverse")

### SNAKEMAKE I/O ###
compound_data        <- snakemake@input[['compound_data']]
lines_compounds      <- snakemake@input[['lines_compounds']]

comma_file <- snakemake@output[['csv_db']]
rdata      <- snakemake@output[['rdata_db']]

full_table <- compound_data %>%
    map(read_csv) %>%
    reduce(bind_rows)

## Annotate how many lines were profiled by comp.
lines_compounds <- read_csv(lines_compounds) 

## Now keep columns of interest
filtered_data <- full_table %>%
    select(
        broad_id,
        name,
        auc,
        ic50,
        depmap_id,
        lineage,
        moa,
        target,
        smiles
        ) %>%
    mutate(
         cid = str_remove_all(
            string = smiles,
            pattern = ",.*"
         )
    ) %>%
    left_join(y = lines_compounds, by = "broad_id")



write_csv(x = filtered_data, file = comma_file)
save(filtered_data, file = rdata)

R tidyverse From line 1 of scripts/prism_generate_drug_db.R

suppressMessages(library('edgeR'))
suppressMessages(library('limma'))

### SNAKEMAKE I/O ###
compound_to_test      <- snakemake@input[['compound_to_test']]
raw_gene_counts       <- snakemake@input[['raw_gene_counts']]

ebayes_model          <- snakemake@output[['ebayes']]

## Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


## Load and set data types manually just in case
ccle_counts           <- readRDS(raw_gene_counts)
compound_to_test      <- read.csv(compound_to_test)

compound_to_test$auc            <- as.numeric(compound_to_test$auc)
compound_to_test$lineage        <- as.factor(compound_to_test$lineage)

## Subset the counts
lines_to_test <- compound_to_test$depmap_id
count_matrix  <- ccle_counts[,lines_to_test]

rownames(compound_to_test) <- compound_to_test$depmap_id

## voom model
design <- model.matrix(~lineage + auc, data=compound_to_test)

## reorder count_matrix so that cols matches rows from design
count_matrix <- count_matrix[,rownames(design)]

dge  <- DGEList(counts=count_matrix)
keep <- filterByExpr(dge, design=design)

dge <- dge[keep, keep.lib.sizes=FALSE]
dge <- calcNormFactors(dge)

## TODO: save voom model plot
v <- voom(dge, plot=FALSE, normalize.method='quantile')

fit <- lmFit(v, design)
fit <- eBayes(fit)

saveRDS(fit, ebayes_model)

R limma From line 1 of scripts/prism_generate_ebayes_model.R

suppressMessages(library('tidyverse'))

### SNAKEMAKE I/O ###
raw_expected_counts <- snakemake@input[["raw_expected_counts"]]
ccle_default_line <- snakemake@input[["ccle_default_line"]]
protein_coding_genes <- snakemake@input[["protein_coding_genes"]]
raw_gene_counts <- snakemake@output[["raw_gene_counts"]]

### SNAKEMAKE PARAMS ###
coding_genes <- as.logical(snakemake@params["coding_genes_only"])

## Load CCLE raw counts
ccle_counts <- data.table::fread(raw_expected_counts)

## Load cell line equivalences
cell_lines <- data.table::fread(ccle_default_line)

## Keep RNA cell line equivalences
cell_lines <- cell_lines %>%
  filter(ProfileType == "rna") %>%
  rename(V1 = ProfileID) %>%
  select(V1, ModelID) %>%
  unique()

## Convert CCLE raw counts to a matrix where each gene is a row and each sample, 
## a column.
## Input is RSEM expected counts (thus I have to round up to units)
genes <- colnames(ccle_counts)[-1]
ccle_counts <- ccle_counts %>%
  merge(cell_lines, by = "V1") %>%
  column_to_rownames("ModelID") %>%
  select(-V1) %>%
  t() %>%
  as.data.frame()
rownames(ccle_counts) <- genes

## Most of the genes with NA values are ERCC-0000X spike-ins
ccle_counts <- na.omit(ccle_counts)

## BROAD format for gene names is HUGO (ENSEMBL)
ccle_counts <- ccle_counts %>%
  mutate(gene_variance = unname(apply(ccle_counts, MARGIN = 1, FUN = var)),
         Gene = str_remove(rownames(ccle_counts), pattern = " \\(.*"),
         Ensembl = str_extract(rownames(ccle_counts), 
                               pattern = "ENSG[0-9]+")) %>%
  ## Get rid of remaining spikes
  filter(!grepl(Gene, pattern = "ERCC-", fixed = TRUE))

## If coding_genes = TRUE, just keep protein coding genes
if (coding_genes) {
  hgnc_coding_genes <- data.table::fread(protein_coding_genes) %>%
    rename(Gene = symbol, Ensembl = ensembl_gene_id) %>%
    select(Gene, Ensembl) %>%
    unique()
  ccle_counts <- ccle_counts %>%
    select(-Gene) %>%
    merge(hgnc_coding_genes)
}

## Keep the most variable gene when two ENSEMBL ids point to the same HGNC
ccle_counts <- ccle_counts %>%
  group_by(Gene) %>%
  filter(gene_variance == max(gene_variance)) %>%
  column_to_rownames("Gene") %>%
  select(-gene_variance, -Ensembl)

## Round counts
ccle_counts <- round(as.matrix(ccle_counts), digits = 0)

## Save object
saveRDS(ccle_counts, file = raw_gene_counts)

R tidyverse RSEM From line 1 of scripts/prism_raw_counts_from_expected_counts.R

suppressMessages(library('GSEABase'))
suppressMessages(library('limma'))
suppressMessages(library('dplyr'))


## SNAKEMAKE I/O  ##
eBayes_model       <- snakemake@input[['fitted_bayes']]
drug_info          <- snakemake@input[['treatment_info']]

geneset_directory <- snakemake@output[['bidirectional_geneset']]

compound_id       <- snakemake@wildcards[['broad_id']]

## SNAKEMAKE PARAMS ##
signature_type    <- tolower(as.character(snakemake@params[['signature_type']]))

# Logging
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

## If signature_type == 'Classic',
## then we'll take the top/bottom 250 genes sorted by T.statistics
## If signature_type == 'fold', we'll perform FDR + log fold selection

generate_bidirectional_signature <- function(sig_name, deg_genes){

    ## Split the table between S_genes (negative logfold) 
    ## and R genes (positive)
    if(signature_type=='classic'){
        sensitivity_genes <- deg_genes[deg_genes$t <0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$t >0, 'ID']
    }else{
        sensitivity_genes <- deg_genes[deg_genes$logFC < 0, 'ID']
        resistance_genes  <- deg_genes[deg_genes$logFC > 0, 'ID']
    }
    if(length(sensitivity_genes) >= 15){
       candidate_genes <- extract_top_genes(deg_genes, 'sensitivity', signature_type=signature_type)
       sensitivity_gset <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
       setName(sensitivity_gset) <- paste(sig_name, 'UP', sep='_')

       GSEABase::toGmt(sensitivity_gset, con = paste(geneset_directory, '/', sig_name, '_UP', '.gmt', sep=''))
    }
    if(length(resistance_genes) >= 15){
        candidate_genes <- extract_top_genes(deg_genes, 'resistance', signature_type=signature_type)
        resistance_set <- GSEABase::GeneSet(candidate_genes, geneIdType=SymbolIdentifier())
        setName(resistance_set) <- paste(sig_name, 'DOWN', sep='_')

        GSEABase::toGmt(resistance_set, con = paste(geneset_directory, '/', sig_name, '_DOWN', '.gmt', sep='')) 
    }
}

#TODO: This function here is extremely ugly. Refactor it
extract_top_genes <- function(deg_genes, mode='sensitivity', signature_type='classic'){

    ## Extract and sort the top hits for the selected direction/mode
    if(mode=='sensitivity'){

        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(deg_genes$t), 'ID'], n=250)
        }else{
            deg_genes <- head(deg_genes[order(deg_genes$logFC), 'ID'], n=250)
        }
    }else{
        if(signature_type=='classic'){
            deg_genes <- head(deg_genes[order(-deg_genes$t), 'ID'], n=250)
        }else{
        deg_genes <- head(deg_genes[order(-deg_genes$logFC), 'ID'], n=250)
        }
    }
    return(deg_genes)
}


drug_info <- data.table::fread(drug_info) %>%
    as.data.frame()

drug_info <- drug_info[!duplicated(drug_info$broad_id),]
drug_info <- drug_info[drug_info$screen_id == 'HTS002',]

bayes_results <- readRDS(eBayes_model)

if(signature_type == 'classic'){
    all_genes <- topTable(bayes_results, coef='auc', number = Inf, adjust.method='fdr')
}else{ 
    ## Set a threshold of the adjusted p-value if signature_type is not classic
    all_genes <- topTable(bayes_results, coef = 'auc', 
                     number = Inf, adjust.method = 'fdr',
                     p.value=0.05)
}
# Set the genes as a column
all_genes$ID <- rownames(all_genes)

if(!dir.exists(file.path(geneset_directory))){
    dir.create(geneset_directory)}

## Check if we have at least 15 genes left after FDR-cutoff. Does not matter
## If both directions are < 15 individually, we'll account for that later.
## Just make sure we are not evaluating empty results 

if(nrow(all_genes) >= 15){

    common_name <- drug_info[drug_info$broad_id == compound_id, 'name']

    if(length(common_name) == 0){
        common_name <- compound_id}

    brd_split   <- strsplit(x=compound_id, split='-', fixed=TRUE)[[1]][2]

    sig_name <- paste(sep='_', common_name,'PRISM',brd_split)

    generate_bidirectional_signature(sig_name, all_genes)

}

R dplyr limma GSEABase From line 1 of scripts/prism_signature_from_ebayes.R

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressMessages(library("hgu219.db"))
suppressMessages(library("biomaRt"))
suppressMessages(library("dplyr"))

## CODE ##
httr::set_config(httr::config(ssl_verifypeer = FALSE))

# Get ENSEMBL IDs for each probe
probeIDs <- unlist(as.list(hgu219ENSEMBL))
probeIDs <- na.omit(data.frame(probe = names(probeIDs), ensembl = probeIDs, 
                               row.names = NULL))

# BiomaRt
mart <- useMart("ENSEMBL_MART_ENSEMBL")
human <- useDataset("hsapiens_gene_ensembl", mart)

# Get HUGO genes
hugoIDs <- getBM(c("hgnc_symbol", "ensembl_gene_id"), mart = human,
                 filters = "ensembl_gene_id", values = unique(probeIDs$ensembl))
colnames(hugoIDs) <- c("hgnc", "ensembl")
hugoIDs <- hugoIDs %>% mutate(hgnc = na_if(hgnc, "")) %>% na.omit

# Merge tables
merged <- merge(probeIDs, hugoIDs, by = "ensembl")
merged <- unique(merged[c("probe", "hgnc", "ensembl")])

# Save
write.table(merged, file = snakemake@output[["tsv"]], sep = "\t", 
            row.names = FALSE, quote = FALSE)

R dplyr hgu219.db biomaRt From line 1 of scripts/probeID_hgnc_table.R

shell:
    "cat {results}/ctrp/genesets/classic/*/*.gmt {results}/gdsc_rna/genesets/classic/*/*.gmt {results}/prism/genesets/classic/*/*.gmt > {output}"