Robust Optogenetic Inhibition with Red-light-sensitive Anion-conducting Channelrh

public public 1yr ago 0 bookmarks

ACRs

This repository hosts the bioinformatic analysis worklow used in Oppermann et al (2023) Robust Optogenetic Inhibition with Red-light-sensitive Anion-conducting Channelrhodopsins, bioRxiv .

The snakemake -based code for the workflow is under workflow/ . The dependencies are under conda control ( snakemake --use-conda ), see workflow/envs . The analysis files are under analysis/ , in particular ACRs/analysis/pdb contains the output files for the structural alignment ( sequences.aln is the main output) and analysis/all hosts the various files generated for the phylogenetic analysis:

  • sequences.fasta : unaligned sequences

  • cdhit.fasta : non-redundant set of sequences

  • mafft.fasta : mafft alignment

  • trimal.fasta : trimal trimmed alignment

  • iqtree.treefile : the phylogenetic tree from iqtree with ultrafasta bootstrap support values

The metadata for the sequences can be found in metadata/Channelrhodopsins_Updated_List.xlsx which is a snapshot of the Catalogue of Natural Channelrhodopsins .

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
library(dplyr)
library(tidyr)
library(bio3d)

template_file <- unlist(snakemake@input)
output_file <- unlist(snakemake@output)

templates <- read.table("pdb/template.txt", col.names = c("ID", "_P_", "fname")) %>%
    mutate(ID = substring(ID, 2))
data <- lapply(templates$fname, read.pdb) %>%
    setNames(templates$ID)
sheets <- lapply(data, `[[`, "sheet") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID") %>%
    mutate(sense = ifelse("sense" %in% names(.), as.numeric(sense), NA), sense = ifelse(sense < 0, "-", "+"))
helices <- lapply(data, `[[`, "helix") %>%
    lapply(data.frame) %>%
    setNames(templates$ID) %>%
    bind_rows(.id = "ID")
list(sheet = sheets, helix = helices) %>%
    bind_rows(.id = "ss") %>%
    filter(chain == "A") %>%
    replace_na(list(sense = ".")) %>%
    mutate(source = "SS", score = ".", frame = ".", attrib = ".") %>%
    select(ID, source, ss, start, end, score, sense, frame, attrib) %>%
    write.table(output_file, quote = F, sep = "\t", col.names = F, row.names = F)
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

template_txt = str(snakemake.input)
fasta_file = str(snakemake.output)

aminoacids = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "PHE": "F",
    "PRO": "P",
    "SER": "S",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "LYR": "K"
}

def to_fasta(fname):
    has_seqres = False
    seq = []
    with open(fname) as fh:
        for line in fh:
            if line.startswith('SEQRES'):
                has_seqres = True
                tag, row_num, chain, length, *residues = line.split()
                if chain == 'A':
                    seq += [ aminoacids[x] for x in residues ]
            elif not has_seqres and line.startswith('ATOM'):
                tag, atom_num, atom, residue, chain, residue_num, *rest = line.split()
                if chain == 'A' and atom == 'N':
                    seq.append(aminoacids[residue])
    return ''.join(seq)

with open(fasta_file, 'w') as out:
    with open(template_txt) as fh:
        for line in fh:
            name, tag, pdb_file = line.split()
            seq = to_fasta(pdb_file)
            record = SeqRecord(Seq(seq), name[1:], description = '')
            SeqIO.write(record, out, 'fasta')
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
library(dplyr)
library(tidyr)
library(treeio)
library(ggtree)
library(readxl)
library(photobiology)
library(ggplot2)
library(ggnewscale)
library(castor)
library(ape)

with(snakemake@input, {
    tree_file <<- tree
    metadata_file <<- metadata
})

output_file <- unlist(snakemake@output)

to_treedata <- function(tree) {
    class(tree) <- c("tbl_tree", "tbl_df", "tbl", "data.frame")
    as.treedata(tree)
}

add_hsp <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    categories <- setNames(tree[[colname]], tree[["label"]]) %>%
        `[`(treedata@phylo$tip.label) %>%
        as.factor
    hsp <- hsp_max_parsimony(treedata@phylo, as.numeric(categories), edge_exponent = 0.1) %>%
        `$`("likelihoods") %>%
        as.data.frame %>%
        setNames(levels(categories)) %>%
        mutate(node = 1:n()) %>%
        gather(value, likelihood, -node) %>%
        filter(likelihood > 0.99) %>%
        setNames(c("node", paste0(colname, "_hsp"), paste0(colname, "_hsp_lh")))
    left_join(tree, hsp, by = "node")
}
get_mrca <- function(phylo, tips) {
    getMRCA(phylo, tips) %>%
        replace(is.null(.), NA)
}
add_mrca <- function(tree, colname) {
    colname <- deparse(substitute(colname))
    treedata <- to_treedata(tree)
    mrca <- mutate(tree, my_column = !!as.name(colname)) %>%
        group_by(my_column) %>%
        mutate(is.tip = label %in% treedata@phylo$tip.label) %>%
        mutate(no_data = all(is.na(my_column))) %>%
        mutate(mrca = get_mrca(treedata@phylo, node[is.tip])) %>%
        mutate(mrca = ifelse(no_data | is.na(mrca), node, mrca)) %>%
        group_by(mrca) %>%
        mutate(enough_tips = sum(is.tip) > 1) %>%
        mutate(ifelse(node == mrca & enough_tips, first(na.omit(my_column)), NA)) %>%
        pull
    tree[[paste0(colname, "_mrca")]] <- mrca
    return(tree)
}

metadata <- read_xlsx(metadata_file, .name_repair = "universal") %>%
    mutate(Sequence.name = sub(",.+", "", Sequence.name)) %>%
    mutate(Sequence.name = gsub("@", "_", Sequence.name)) %>%
    mutate(Maximum..nm = ifelse(is.na(Action.maximum..nm), Absorption.maximum..nm, Action.maximum..nm)) 

tree <- read.iqtree(tree_file) %>%
    as_tibble %>%
    left_join(metadata, by = c(label = "Sequence.name")) %>%
    mutate(Color = unname(w_length2rgb(Maximum..nm))) %>%
    mutate(Category = case_when(Currents %in% c("no photocurrents", "channel") ~ NA_character_, T ~ gsub("[][]", "", Currents))) %>%
    mutate(Symbol = gsub(",.+", "", Symbol)) %>%
    mutate(Symbol_show = ifelse(Currents.reference == "[Oppermann23](in_prep)", Symbol, NA)) %>%
    add_mrca(ChR.group) %>%
    add_hsp(Category)

cat_colors <- list(
    "anion channel" = "indianred",
    "cation channel" = "deepskyblue",
    "potassium channel" = "purple",
    "channel" = "yellow4"
)
p <- ggtree(to_treedata(tree), aes(color = Category_hsp), layout = "ape") +
    scale_color_manual(values = cat_colors) + new_scale_color() +
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 95), size = 0.2, color = "#4d4d4dff") +
    geom_nodepoint(aes(x = branch.x, y = branch.y, subset = !is.na(UFboot) & UFboot >= 90 & UFboot < 95), size = 0.2, color = "#b3b3b3ff") +
    geom_tippoint(aes(subset = !is.na(Category) & is.na(Color)), color = "darkgray") +
    geom_tippoint(aes(subset = !is.na(Color), color = Color)) + scale_colour_identity() + new_scale_color() +
    geom_tiplab2(aes(label = Symbol_show), hjust = -0.2) +
    geom_treescale(width = 0.5) +
    geom_cladelab(mapping = aes(subset = !is.na(ChR.group_mrca), node = node, label = ChR.group_mrca), offset = -0.1) +
    xlim(-10, 10)
ggsave(output_file, p, width = 7, height = 7)
2
3
4
5
6
7
8
9
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

with open(str(snakemake.output), 'w') as file:
    for name, seq in snakemake.params['seq'].items():
        record = SeqRecord(Seq(seq), id = name)
        SeqIO.write(record, file, 'fasta')
28
29
script:
    "scripts/write_fasta.py"
40
41
shell:
    "mafft --thread {threads} --auto {input} > {output}"
52
53
shell:
    "cdhit -i {input} -o {output} -c {params.c} -d 0"
64
65
shell:
    "trimal -in {input} -out {output} -gt {params.gt}"
74
75
shell:
    "gotree reroot midpoint -i {input} -o {output}"
92
93
shell:
    "iqtree2 -seed {params.seed} -s {input} -B {params.B} -T {threads} -pers {params.pers} -nstop {params.nstop} --prefix {params.prefix} -redo"
102
103
script:
    "scripts/pdb2fasta.py"
120
121
shell:
    "t_coffee {input.sequences} -outfile {output.aln} -newtree {output.dnd} -method {params.method} -template_file {input.template} -pdb_min_sim {params.pdb_min_sim} -pdb_min_cov {params.pdb_min_cov} -n_core {threads}"
131
132
script:
    "scripts/plot_tree.R"
141
142
script:
    "scripts/features.R"
ShowHide 8 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/BejaLab/ACRs
Name: acrs
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: GNU General Public License v3.0
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...