SD divergence analysis for Vollger et al., 2023


This is a Snakemake workflow for analyzing variants from assemblies aligned to CHM13 v1.1.

See run.sh for how to execute the workflow. However, you will need the pre-aligned sequences, which you can find on Zenodo.

Dependencies are handled by Snakemake and conda.
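
For reference, a minimal invocation might look like the sketch below (the exact flags and resources are defined in run.sh; the core count here is only illustrative):

    snakemake --use-conda --cores 24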

Code Snippets

shell:
    """
    bedtools sort -i {input.annotation_file} \
        | gzip -c > {output}
    """
shell:
    """
    NUM_COL=$(gunzip -c {input.dist} | head -n 1 | awk '{{print NF}}' || :)
    LAST_COL=$((NUM_COL+4))
    echo $NUM_COL, $LAST_COL 

    gunzip -c {input.windows} \
        | grep -v "^#" \
        | cut -f 1-3 \
        | bedtools closest -D b -t first \
            -a - \
            -b {input.dist} \
        | cut -f 1,2,3,$LAST_COL \
        > {output}
    """
shell:
    """
    HEADER=$(gunzip -c {input.windows} | head -n 1 || :)
    HEADER="${{HEADER}}\t{params.dist}"
    echo $HEADER

    paste \
        <(gunzip -c {input.windows} | grep -v "^#") \
        <(paste {input.distances} | cut -f {params.columns} ) \
        | sed "1s/^/${{HEADER}}\\n/" \
        | pigz -p {threads} \
        > {output}
    """
shell:
    """
    HEADER=$(gunzip -c {input.windows} | head -n 1 || :)
    HEADER="${{HEADER}}\tnum_snv\thap"
    echo $HEADER

    gunzip -c {input.windows} \
        | grep "{wildcards.sm}_{wildcards.h}" \
        | bedtools coverage \
            -a - \
            -b <(csvtk filter -C "$" -tT -f "anno_TRF<1" {input.snv}) \
            -counts -sorted \
        | sed "s/$/\\t{wildcards.sm}_{wildcards.h}/" \
        | sed "1s/^/${{HEADER}}\\n/" \
        | csvtk cut -C "$" -tT -f -haps \
        | pigz -p {threads} \
        > {output}
    """
shell:
    """
    HEADER=$(zcat {input.snv[1]} | head -n 1 || :)
    echo $HEADER
    zcat {input.snv} \
        | sort -k 1,1 -k2,2n \
            --parallel={threads} -S 80G \
        | grep -v "^#" \
        | sed "1s/^/${{HEADER}}\\n/" \
        | pigz -p {threads} \
        > {output}
    """
shell:
    """
    NUM_COL=$(gunzip -c {input.dist} | head -n 1 | awk '{{print NF}}' || :)
    LAST_COL=$((NUM_COL+4))
    echo $NUM_COL, $LAST_COL 

    gunzip -c {input.snv} \
        | cut -f 1-3 \
        | bedtools closest -D b -t first \
            -a - \
            -b {input.dist} \
        | cut -f 1-3,$LAST_COL \
        | bedtools sort -i - \
        > {output}
    """
shell:
    """
    HEADER=$(cat {input.snv} | head -n 1 || :)
    HEADER="${{HEADER}}\t{params.dist}"
    echo $HEADER

    paste \
        <(grep -v "^#" {input.snv}) \
        <(paste {input.distances} | grep -v "^#" | cut -f {params.columns} ) \
        | grep -v "^#" | sed "1s/^/${{HEADER}}\\n/" \
        | csvtk filter -C "$" -tT -f "anno_TRF<1" \
        > {output}
    """
shell:
    """
    HEADER=$(head -n 1 {input.snv[1]} || :)
    echo $HEADER

    sort -m -k 1,1 -k2,2n {input.snv} \
        | grep -v "^#" \
        | sed "1s/^/${{HEADER}}\\n/" \
        | pigz -p {threads} \
        > {output}
    """
    shell:
        """
        zcat {input} | cut -f 1-3,15,18- | pigz -p {threads} > {output}
        """


"""
df = pd.read_csv(input[0], sep="\t")
dist_col = [
    col for col in df if col.startswith("dist_") or col.startswith("anno_")
]
names = ["#CHROM", "POS", "END", "REF", "ALT", "HAP", "SAMPLE"] + dist_col
df[names].to_csv(output[0], sep="\t", index=False, compression="gzip")
"""
script:
    "../scripts/divergence_cum.R"
script:
    "../scripts/called_regions.R"
script:
    "../scripts/violin_plots.R"
shell:
    """
    #bedtools intersect -a {input.callable} \
    #    -b <( bedtools sort -i {input.aln} | awk '$3 - $2 >= {params.min_syntenic_size}' ) 
    bedtools intersect -header -v -a {input.callable} -b {input.exclude} \
        | bedtools sort -i - \
        | bedtools merge -i - \
        | sed 's/$/\\t{wildcards.sm}_{wildcards.h}/g' \
        | gzip -c > {output}
    """
shell:
    """
    bedtools sort -i {input.annotation_file} \
        | bedtools merge -i - \
        | gzip -c > {output}
    """
shell:
    """
    bedtools intersect \
        -sorted \
        -a {input.bed} \
        -b {input.anno1} \
    | bedtools intersect \
        -sorted \
        -a - \
        -b {input.anno2} \
    | bedtools sort -i - \
    | bedtools merge -i - \
    | gzip -c \
    > {output}
    """
shell:
    """
    printf "{wildcards.sm}_{wildcards.h}\t" > {output}
    if [ {wildcards.anno1} == {wildcards.anno2} ]; then
        printf "{wildcards.anno1}_size\t" >> {output}
    else
        printf "{wildcards.anno1}_{wildcards.anno2}_size\t" >> {output}
    fi 
    awk '{{sum += $3 - $2}}END{{print sum}}' <(gunzip -c {input}) >> {output}
    """
shell:
    """
    printf "hap\tanno\tsize\n" > {output}
    cat {input.combos} \
        | sed 's/gene-conversion-.*_size/IGC_size/g' \
        | sort >> {output}
    """
run:
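    # pivot to one row per haplotype with one size column per annotation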
    df = pd.read_csv(str(input), sep="\t")
    out = df.pivot(index="hap", columns="anno", values="size")
    out.to_csv(str(output), sep="\t")
script:
    "../scripts/explode_snv.py"
From line 13 of rules/snv.smk
shell:
    """
    gunzip -c {input.snv} \
        | awk -F$'\\t' -v pat="h{wildcards.h}|HAP" '$9 ~ pat' \
        | bedtools intersect -u \
            -a - \
            -b <(gunzip -c {input.callable}) \
            -header \
        | bedtools intersect -header -v -a - -b {input.exclude} \
        | bedtools sort -header -i - \
    | gzip -c > {output}
    """
script:
    "../scripts/implode_snv.py"
From line 56 of rules/snv.smk
shell:
    """
    HEADER=$(gunzip -c {input.snv} | head -n 1 || :)
    HEADER="${{HEADER}}\t{params.annotation_names}"
    echo $HEADER

    gunzip -c {input.snv} \
        | bedtools annotate -i - -counts \
            -files {input.annotation_files} \
        | bedtools sort -i - \
        | sed "1s/^/${{HEADER}}\\n/" \
        > {output}
    """
script:
    "../scripts/per_hap_kbp_stats.R"
script:
    "../scripts/per_kbp_stats.R"
script:
    "../scripts/make_r_data.R"
shell:
    """
    csvtk cut -C "$" -tT \
        -f "#chr",start,end,num_snv  \
        {input.bed} \
        > {output.bg}

    bedGraphToBigWig {output.bg} {input.fai} {output.bigwig}
    """
shell:
    """
    zcat {input.bed} \
        | csvtk -tT -C "$" cut -f "#chr",start,end,hap_count,num_snv \
        | bedtools merge -i - -d -{params.window_size} -c 4,5 -o distinct,sum \
        | awk -v OFS=$'\t' '$3-$2 >= {params.window_size} {{print $1,$2,$3,$5/$4}}' \
    > {output.bg}

    bedGraphToBigWig {output.bg} {input.fai} {output.bigwig}
    """
run:
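    # write the UCSC trackDb, hub, and genomes files: one track per haplotype plus a combined "all" track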
    strong_color = "175,4,4"
    weak_color = "47,79,79"
    out = open(output.track, "w")
    out.write(track_db_header.format(window_size=config["window_size"]))
    for sm in ["all"] + list(tbl.index):
        for h in [1, 2]:
            if sm == "all" and h == 2:
                continue
            out.write(
                track.format(
                    sm=sm, h=h, strong_color=strong_color, weak_color=weak_color
                )
            )
    out.close()
    open(output.hub, "w").write(hub)
    open(output.genomes, "w").write(genomes)
shell:
    """
    bedtools makewindows -g {input.fai} -w {params.window_size} -s {params.step_size} \
        | bedtools intersect -header -v -a - -b {input.exclude} \
        | bedtools sort -i - \
        | gzip -c > {output}
    """
shell:
    """
    NUM_COL=$(gunzip -c {input.windows} | head -n 1 | awk '{{print NF}}' || :)
    NAME_COL=$((NUM_COL+1))
    BOOL_COL=$((NUM_COL+2))
    echo $NAME_COL $BOOL_COL

    bedtools intersect -f 0.95 -C \
        -sorted -sortout \
        -a {input.windows} -b {input.beds} \
        -names {params.names} \
    | awk -v bool=$BOOL_COL '$bool != 0' \
    | bedtools merge -i - \
        -d {params.overlap} \
        -c $NAME_COL,$BOOL_COL -o distinct,sum \
    | sed '1s/^/#chr\\tstart\\tend\\thaps\\thap_count\\n/' \
    | gzip -c > {output}
    """
shell:
    """
    HEADER=$(gunzip -c {input.windows} | head -n 1 || :)
    HEADER="${{HEADER}}\t{params.annotation_names}"
    echo $HEADER

    bedtools annotate -i {input.windows} \
            -files {input.annotation_files} \
        | bedtools sort -i - \
        | sed "1s/^/${{HEADER}}\\n/" \
        | gzip -c > {output}
    """
run:
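    # label each window with the first annotation covering >=95% of it; everything else stays "Other"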
    df = pd.read_csv(str(input.bed), sep="\t")
    anno_cols = [col for col in df.columns if col.startswith("anno_")]
    df["region"] = "Other"
    for anno in anno_cols:
        df.loc[
            (df[anno] >= 0.95) & (df["region"] == "Other"), "region"
        ] = anno[len("anno_"):]  # drop the "anno_" prefix (str.strip would remove matching characters, not the prefix)
    df.to_csv(output.bed, sep="\t", index=False)
shell:
    """
    bedtools complement \
        -i <(bedtools merge -i {input.windows}) \
         -g <(cat {input.fai} | sort -k 1,1 -k2,2n ) \
        | gzip -c > {output}
    """
source("workflow/scripts/setup.R")

all.files <- Sys.glob("results/syntenic_and_callable/H*bed.gz")
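# the Sys.glob path above is a local debugging default; when run by Snakemake it is replaced by the rule input below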
all.files <- snakemake@input[["beds"]]
mylist <- lapply(all.files, fread)
df <- rbindlist(mylist, idcol = TRUE)
colnames(df) <- c("ID", "chr", "start", "end", "haplotype")

p <- df %>%
    group_by(haplotype) %>%
    summarize(Mbp = sum(end - start) / 1e6) %>%
    ggplot(aes(x = Mbp, y = haplotype, label = comma(Mbp))) +
    geom_bar(stat = "identity") +
    geom_vline(xintercept = 3100, color = RED, linetype = "dashed", size = 1) +
    geom_text(hjust = 0) +
    theme_cowplot() +
    ggtitle("Callable space for each haplotype assebmly",
        subtitle = "(Only regions with at least 1 Mbp of synteny considered)"
    )

# read in long windows
called_regions <- read_in_snv_windows(snakemake@input[["windows"]])
called_regions$region <- factor(called_regions$region, levels = names(COLORS))
p2 <- called_regions %>%
    group_by(hap, region) %>%
    do(get_num_bp(.)) %>%
    ggplot(aes(
        x = V1 / 1e6,
        y = hap,
        label = comma(round(V1 / 1e6)), fill = region
    )) +
    geom_bar(stat = "identity") +
    geom_vline(xintercept = 3100, color = RED, linetype = "dashed", size = 1) +
    geom_text(hjust = 0) +
    scale_fill_manual(values = COLORS) +
    facet_wrap(. ~ region, scales = "free") +
    ylab("") +
    xlab("Mbp") +
    theme_cowplot() +
    theme(legend.position = "top")

height <- length(unique(called_regions$hap)) / 2
fig <- p2
print(height)
ggsave(snakemake@output[[1]],
    plot = fig,
    width = 16, height = height,
    units = "in",
    limitsize = FALSE
)
source("workflow/scripts/setup.R")

infile <- snakemake@input[[1]]
outfile <- snakemake@output[[1]]
outfile2 <- snakemake@output[[2]]

df <- read_in_snv_windows(infile)


chrX <- copy(df[df[["#chr"]] == "chrX"])
chrX$region <- "chrX"
df <- rbind(df, chrX)

pal <- COLORS[unique(df$region)]
df$region <- factor(df$region, levels = names(COLORS))

#
# make plot
#
fakeadd <- 0.005
plot.df <- df %>%
    filter(region %in% c("Unique", "SD", "chrX")) %>%
    data.table()
plot.df[per_div == 0]$per_div <- fakeadd

p <- ggplot() +
    stat_ecdf(
        data = plot.df,
        aes(per_div, color = region),
        size = 1.5, alpha = 0.75
    ) +
    scale_x_log10(
        limits = c(fakeadd, 20),
        breaks = c(fakeadd, 0.01, 0.1, 1, 10),
        labels = c("0.00", "0.01", "0.10", "1.00", "10.0")
    ) +
    annotation_logticks(sides = "b") +
    scale_fill_manual(values = pal) +
    scale_color_manual(values = pal) +
    xlab(glue("% divergence of 10 kbp windows (1 kbp slide)")) +
    ylab("Cumulative fraction of windows") +
    ggtitle("Divergence of 10 kbp windows aligned to T2T-CHM13 v1.1",
        subtitle = "(Minimum 1 Mbp alignment, SD windows are at least 95% SD)"
    ) +
    theme_cowplot() +
    theme(legend.position = "top", legend.title = element_blank()) +
    guides(fill = guide_legend(ncol = length(pal) / 2))
ggsave(outfile, width = 12, height = 8, plot = p)


#
# all haplotypes
#
plot.df2 <- plot.df %>%
    filter(region %in% c("SD", "Unique")) %>%
    data.table()

p2 <- p +
    stat_ecdf(
        data = plot.df2,
        aes(per_div, group = paste0(hap, region), color = region),
        alpha = 0.5,
        size = 0.1,
        linetype = "dashed"
    )
ggsave(outfile2, width = 12, height = 8, plot = p2)
import pandas as pd

df = pd.read_csv(snakemake.input.bed, sep="\t", low_memory=False)
print(f"All in the vcf:\t{df.shape[0]/1e6}M")
df = df[((df["TYPE"] == "SNV") | (df["TYPE"] == "SNP")) & (df["#CHROM"] != "chrY")]
print(f"SNP and not chrY:\t{df.shape[0]/1e6}M")

df = df.astype(str)
df["GT"] = df["GT"].str.strip()
df["HAP"] = df["HAP"].str.strip()
df = (
    df.apply(lambda col: col.str.split(";").explode())
    .reset_index()
    .reindex(df.columns, axis=1)
)

#     638  .|.
#     204  .|0
#      46  0|.
# 2383400  0|1
#   61908  .|1
#   68572  1|.
# 2393786  1|0
# 2325920  1|1
# fix the genotypes for expanded
df.loc[(df["HAP"] == "h1") & (df["GT"] == "1|1"), "GT"] = "1|."
df.loc[(df["HAP"] == "h2") & (df["GT"] == "1|1"), "GT"] = ".|1"

is_h1 = (df["HAP"] == "h1") & (
    (df["GT"] == "1|0") | (df["GT"] == "1|1") | (df["GT"] == "1|.")
)
is_h2 = (df["HAP"] == "h2") & (
    (df["GT"] == "0|1") | (df["GT"] == "1|1") | (df["GT"] == ".|1")
)

df = df[is_h1 | is_h2]

df["SAMPLE"] = snakemake.wildcards.sm
print(f"Final output exploded:\t{df.shape[0]/1e6}M")
df.to_csv(snakemake.output.snv, sep="\t", compression="gzip", index=False)
import pandas as pd

li = []
for filename in snakemake.input.beds:
    tdf = pd.read_csv(filename, index_col=None, sep="\t", low_memory=False)
    li.append(tdf)

df = pd.concat(li, axis=0, ignore_index=True)
df = df.astype(str)

id_cols = ["ID"]
if "SAMPLE" in df.columns:
    id_cols = ["ID", "SAMPLE"]
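# collapse rows describing the same variant: group by ID (and SAMPLE), ';'-joining the unique values of every other field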

implode = (
    df.sort_values(by=["#CHROM", "POS", "ID", "HAP"])
    .groupby(id_cols)
    .agg(lambda x: ";".join(x.unique()))
    .reset_index()
)
implode.loc[(implode["GT"] == "1|.;.|1"), "GT"] = "1|1"

implode = implode.loc[:, df.columns]

implode.to_csv(snakemake.output.snv, sep="\t", compression="gzip", index=False)
library(tidyverse)
library(dplyr)
library(data.table)
library(tzdb)
library(vroom)

clean_bed_df <- function(df) {
    chrs <- paste("chr", c(1:22, "X", "Y", "M"), sep = "")
    colnames(df)[1:3] <- c("chr", "start", "end")
    df$chr <- factor(df$chr, levels = chrs)
    if ("chr1" %in% colnames(df)) {
        df$chr1 <- factor(df$chr1, levels = chrs)
    }
    if ("chr2" %in% colnames(df)) {
        df$chr2 <- factor(df$chr2, levels = chrs)
    }
    # remove dup cols
    df[, !duplicated(colnames(df)), with = FALSE]
}

read_bed_file <- function(infile, threads = 8) {
    # df <- data.table::fread(infile,
    #    nThread = threads,
    #    stringsAsFactors = TRUE,
    #    showProgress = TRUE
    # )
    clean_bed_df(
        data.table(
            vroom(infile, num_threads = threads)
        )
    )
}
longf <- "/Users/mrvollger/Desktop/EichlerVolumes/chm13_t2t/nobackups/sd-divergence/results/long_windows_with_snv_dist_annotation.bed.gz" # nolint

snvf <- "/Users/mrvollger/Desktop/EichlerVolumes/chm13_t2t/nobackups/sd-divergence/results/small_snv_exploded.bed.gz" # nolint

tblf <- "/Users/mrvollger/Desktop/EichlerVolumes/chm13_t2t/nobackups/sd-divergence/results/tables/snv_per_kbp.tbl" # nolint
dataf <- "results/image.Rdata"

longf <- snakemake@input$long
snvf <- snakemake@input$snv
tblf <- snakemake@input$tbl
dataf <- snakemake@output$rdata

cat("Reading windows")
df_w <- read_bed_file(longf)
save(df_w, file = snakemake@output$windows)

cat("Reading table")
df_tbl <- fread(tblf)
save(df_tbl, file = snakemake@output$tbl)

cat("Reading SNVs")
df_snv <- read_bed_file(snvf)
save(df_snv, file = snakemake@output$snv)


if (F) {
    outfile <- "~/Desktop/EichlerVolumes/chm13_t2t/nobackups/sd-divergence/misc_scripts/anno.snv_total.bed" # nolint
    df_w_snv <- df_w %>%
        group_by_at(setdiff(names(df_w), c("num_snv", "hap"))) %>%
        summarise(num_snv = sum(num_snv)) %>%
        data.table()

    write.table(df_w_snv,
        file = outfile,
        sep = "\t",
        row.names = FALSE,
        col.names = TRUE,
        quote = FALSE
    )
}
source("workflow/scripts/setup.R")
threads <- 8
threads <- snakemake@threads
outfile <- snakemake@output[[1]]
outfile2 <- snakemake@output[[2]]

snv_f <- "results/all_snv_exploded.bed.gz"
snv_f <- snakemake@input[[1]]
df <- fread(snv_f, showProgress = TRUE, nThread = threads) %>%
    mutate(hap = paste0(SAMPLE, gsub("h", "_", HAP))) %>%
    data.table()

#
# add paired annotation columns
#
anno_cols <- sort(names(df)[grepl("anno_", names(df))])
for (anno in anno_cols) {
    type <- gsub("anno_", "", anno)
    if (!(type %in% c("SD", "Unique"))) {
        new_col <- paste0("anno_SD+", type)
        print(new_col)
        df[[new_col]] <- 0
        df[(anno_SD == 1 & df[[anno]] == 1), new_col] <- 1
    }
}
# remove SNVs (rows) that do not overlap any annotation
df <- df[rowSums(df[, ..anno_cols]) > 0]
# get new longer pairs
paired_anno_cols <- sort(names(df)[grepl("anno_", names(df))])

#
# make regions definitions
#
keep_cols <- c("hap", "ID", paired_anno_cols)
df <- df[anno_SD > 0 | anno_Unique > 0] %>%
    select(all_of(keep_cols)) %>%
    pivot_longer(all_of(paired_anno_cols),
        names_to = "region",
        names_prefix = "anno_"
    ) %>%
    filter(value > 0) %>%
    data.table()

#
# read in region sizes
#
sizes_f <- "results/annotation/annotation_sizes.tbl"
sizes_f <- snakemake@input[[2]]
region_sizes <- fread(sizes_f) %>%
    mutate(region = gsub("_", "+", gsub("_size", "", anno))) %>%
    data.table()

out_df <- df %>%
    merge(region_sizes, by = c("hap", "region")) %>%
    # , allow.cartesian = TRUE) %>%
    group_by(hap, region) %>%
    summarize(
        "# SNVs" = n(),
        Mbp = unique(size) / 1e6,
        "# SNVs per 10 kbp" = 1e4 * n() / unique(size),
    )

out_df_wide <- out_df %>%
    pivot_wider(
        names_from = region,
        values_from = c("# SNVs", "Mbp", "# SNVs per 10 kbp"),
        names_sort = TRUE,
        names_glue = "{region} {.value}",
    ) %>%
    select(order(colnames(.))) %>%
    data.table()

fwrite(out_df, outfile, sep = "\t", quote = FALSE, row.names = FALSE)
fwrite(out_df_wide, outfile2, sep = "\t", quote = FALSE, row.names = FALSE)
source("workflow/scripts/setup.R")
threads <- 8
threads <- snakemake@threads
outfile <- snakemake@output[[1]]
outfile2 <- snakemake@output[[2]]
outfile3 <- snakemake@output[[3]]

all.files <- Sys.glob("results/tables/*/*kbp.tbl")
all.files <- snakemake@input$long
mylist <- lapply(all.files, fread)
df <- rbindlist(mylist)
df

wide_files <- Sys.glob("results/tables/*/*wide.tbl")
wide_files <- snakemake@input$wide
wide_list <- lapply(wide_files, fread)
wide_df <- rbindlist(wide_list, fill = TRUE)
wide_df


fwrite(df, outfile, sep = "\t", quote = FALSE, row.names = FALSE)
fwrite(wide_df, outfile2, sep = "\t", quote = FALSE, row.names = FALSE)

dir.create(outfile3, showWarnings = FALSE)
fileConn <- file(paste0(outfile3, "/index.html"))
x <- kable(wide_df,
    format = "html",
    align = "l",
    digits = 2,
    caption = "Average divergence statistics"
)
write(x, fileConn)
close(fileConn)
source("workflow/scripts/setup.R")

infile <- "results/tables/snv_per_kbp.tbl"
infile2 <- ".test/hprc_year1_sample_metadata.txt"
outfile <- "~/Desktop/violin.pdf"

infile <- snakemake@input[[1]]
infile2 <- snakemake@input[[2]]
outfile <- snakemake@output[[1]]

pop <- fread(infile2, fill = TRUE)

df <- fread(infile) %>%
    filter(hap != "CHM1_2") %>%
    mutate(Sample = gsub("_(1|2)", "", hap)) %>%
    merge(pop, by = "Sample", all.x = T)

#
# make calculations minus IGC
#
if ("IGC" %in% df$region) {
    igc <- df[region == "IGC", ]
    sd <- copy(df[region == "SD", ])
    n_snvs <- sd$`# SNVs` - igc$`# SNVs`
    n_mbp <- sd$Mbp - igc$Mbp
    n_per_10_kbp <- 1e4 * n_snvs / (n_mbp * 1e6)
    sd$region <- "SD-IGC"
    sd$`# SNVs` <- n_snvs
    sd$`Mbp` <- n_mbp
    sd$`# SNVs per 10 kbp` <- n_per_10_kbp
    df <- rbind(df, sd)
}
if ("Acrocentric" %in% df$region) {
    acro <- df[region == "Acrocentric", ]
    sd <- copy(df[region == "SD", ])
    n_snvs <- sd$`# SNVs` - acro$`# SNVs`
    n_mbp <- sd$Mbp - acro$Mbp
    n_per_10_kbp <- 1e4 * n_snvs / (n_mbp * 1e6)
    sd$region <- "SD-Acrocentric"
    sd$`# SNVs` <- n_snvs
    sd$`Mbp` <- n_mbp
    sd$`# SNVs per 10 kbp` <- n_per_10_kbp
    df <- rbind(df, sd)
}


df$facet_row <- "SD"
df[region != "Unique"]$facet_row <- df[region != "Unique"]$region


new <- sort(unique(df$region[!(df$region %in% names(COLORS))]))
new_cols <- rep("gray", length(new)) # brewer.pal(length(new), "RdYlBu")
names(new_cols) <- new
pcolors <- c(COLORS, new_cols)

df$facet_row <- factor(df$facet_row, levels = names(pcolors))

pdf(outfile, height = 5, width = 9)
for (i in unique(df$facet_row)) {
    mbp <- round(df[df$region == i, "Mbp"], 2)
    gbp <- round(df[df$region == "Unique", "Mbp"] / 1000, 2)
    sdmbp <- round(df[df$region == "SD", "Mbp"], 2)
    title <- glue("Mbp of {i} considered {min(mbp)} - {max(mbp)} \n")
    subtitle <- glue("Mbp of SD considered {min(sdmbp)} - {max(sdmbp)}\n")
    subsub <- glue("Gbp of unique considered {min(gbp)} - {max(gbp)}")
    print(title)
    print(i)
    title <- paste(title, subtitle, subsub, sep = "\n")

    tdf <- df %>%
        filter(facet_row == i | region == "Unique" | region == "SD") %>%
        mutate(region = factor(region, levels = unique(c(i, "SD", "Unique"))))

    sumdf <- tdf %>%
        mutate(y = 0.9 * min(`# SNVs per 10 kbp`)) %>%
        group_by(region, Superpopulation, y) %>%
        summarize(
            label = round(median(`# SNVs per 10 kbp`), 1)
        )

    p <- tdf %>%
        ggplot(
            aes(
                x = region,
                y = `# SNVs per 10 kbp`,
                color = region,
                fill = region
            )
        ) +
        geom_text(data = sumdf, aes(label = label, y = y)) +
        geom_text_repel(
            data = tdf %>% filter(Sample == "CHM1xx"),
            aes(
                label = Sample,
                x = region,
                y = `# SNVs per 10 kbp`,
            ),
            color = "black",
            direction = "x",
            nudge_x = -1,
            arrow = arrow(length = unit(0.015, "npc")),
        ) +
        geom_violin(alpha = 0.5) +
        geom_jitter(width = 0.2, alpha = 0.75) +
        facet_row(~Superpopulation) +
        scale_x_discrete(guide = guide_axis(n.dodge = 3)) +
        # facet_grid(facet_row ~ Superpopulation, scales = "free") +
        # facet_grid_paginate(facet_row ~ Superpopulation,
        # nrow = 1,
        # ncol = length(unique(df$Superpopulation)),
        # page = length(unique(df$facet_row)),
        # page = i,
        # scales = "free"
        # ) +
        ggtitle("", subtitle = title) +
        scale_fill_manual(values = pcolors) +
        scale_color_manual(values = pcolors) +
        theme_minimal_hgrid() +
        theme(legend.position = "none") +
        xlab("Genomic region")
    print(p)
}
dev.off()
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/mrvollger/sd-divergence
Name: sd-divergence
Version: v0.1
Copyright: Public Domain
License: MIT License