Run MedRemix with BEDPE input as part of the PLBR database workflow


Summary

  • Run the MedRemix methylation profiler with BEDPE input as part of the PLBR database workflow.

  • This pipeline is an extension of the original MedRemix pipeline (https://github.com/pughlab/cfMeDIP-seq-analysis-pipeline).

  • The pipeline accepts FASTQ, BAM, or BEDPE files as input.

  • For the full specification of the BEDPE7+12 file format, please visit https://github.com/pughlab/bam2bedpe (the fields this pipeline relies on are sketched after this list).

  • The pipeline is written in Snakemake and designed to run on a SLURM cluster, but it can also run locally.

  • Note:

    • .bam filtering differs slightly from the original MedRemix pipeline, so results will differ slightly;

    • however, this pipeline outputs the same results whether the input is .bam or .bedpe.
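As a quick orientation, here is a sketch of the BEDPE7+12 columns that the downstream cleanup script (get_clean_bedpe4medremix, reproduced below) actually references. This mapping is inferred from that script's awk field usage, so treat the bam2bedpe repository as the authoritative specification:

    cols 1-3    chrom / start / end of mate 1
    cols 4-6    chrom / start / end of mate 2
    col  7      read (fragment) name
    cols 8-9    MAPQ of mate 1 / mate 2
    cols 10-11  strand of mate 1 / mate 2
    cols 12-13  CIGAR of mate 1 / mate 2
    cols 14-15  FLAG of mate 1 / mate 2
    cols 16+    remaining columns (including TLEN); see the specification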

Workflow overview

MedRemixBEDPE (workflow diagram)

Please see the wiki for a tutorial on setting up and running the pipeline.

For questions, please contact [email protected].

Code Snippets

shell:
    'bash src/bam2bedpe_scripts/bam2bedpe.sh -s {params.slurm_local} -c {params.chunks} -b {input} -o {params.out_dir} -a {params.conda_activate} -e {params.conda_env}'
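For reference, a concrete invocation of the wrapped script (paths and chunk count are hypothetical; flags are documented in the script's help text, reproduced below):

    bash src/bam2bedpe_scripts/bam2bedpe.sh -s slurm -c 20 \
        -b /cluster/projects/cohortA/sample1.bam \
        -o /cluster/projects/cohortA/bedpe \
        -a /cluster/home/user/bin/miniconda3/bin/activate \
        -e pipeline-medremixBEDPE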
shell:
    'bash src/bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh -i {input} -s {wildcards.sample} -p {wildcards.species} -c {wildcards.chrom} -f {params.frag_len_limit} -o {params.out_dir}'
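Similarly, a hypothetical direct invocation of the per-chromosome cleanup script (flags per its help text, reproduced below):

    bash src/bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh \
        -i /path/to/sample1_coordSortd.bedpe.gz -s sample1 \
        -p human -c chr2 -f 500 -o /path/to/bedpe_bin_stats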
shell:
    'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths'
shell:
    'Rscript src/R/row_bind_tables.R -p "{params.paths}" -o {output} --in-tsv --out-feather --omit-paths'
shell:
    'samtools merge {output} {input} && samtools index {output}'
shell:
    'ln -s {input} {output} && samtools index {output}'
shell:
    'ln -s {input} {output}'
shell:
    'gunzip -dc {input} > {output}'
shell:
    'trim_galore --cores 4 --dont_gzip --paired {params.trimgalore_settings} --output_dir {params.outdir} {input.R1} {input.R2} && cp ' + path_to_data + '/{wildcards.cohort}/tmp/trim_fastq/{wildcards.sample}_lib{wildcards.lib}_extract_barcode_R*.fastq_trimming_report.txt ' + path_to_data + '/{wildcards.cohort}/qc/trim_galore'
shell:
    'fastqc --outdir {params.outdir} {input}'
shell:
    'samtools view {input} | grep -v "F19K16\|F24B22" | cat <(samtools view -H {input} | grep -v "F19K16\|F24B22") - | samtools view -b - > {output} && samtools index {output}'
shell:
    'fastqc --outdir {params.outdir} {input}'
shell:
    'samtools flagstat {input} > {output}'
shell:
    'Rscript src/QC/QC_MEDIPS.R --bamFile {input} --outputDir {params.outdir} --windowSize {params.winsize}'
Snakemake: from line 66 of rules/qc.smk
shell:
    'picard CollectGcBiasMetrics -I {input} -O {output.metric} -CHART {output.chart} -S {output.summary} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectGcBiasMetrics {output.summary} {output.summary}.parsed'
shell:
    'picard CollectInsertSizeMetrics -INCLUDE_DUPLICATES true -I {input} -O {output.metric} -H {output.histo} -M 0 -W 600 && bash src/QC/parse_picard_QC.sh picard.CollectInsertSizeMetrics {output.metric} {output.metric}.parsed'
shell:
    'picard MarkDuplicates -I {input} -O {output.markdup_bam} -M {output.metric} && bash src/QC/parse_picard_QC.sh picard.MarkDuplicates {output.metric} {output.metric}.parsed'
shell:
    'picard CollectAlignmentSummaryMetrics -I {input} -O {output.metric} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectAlignmentSummaryMetrics {output.metric} {output.metric}.parsed'
shell:
    'picard CollectQualityYieldMetrics -I {input} -O {output.metric} && bash src/QC/parse_picard_QC.sh picard.CollectQualityYieldMetrics {output.metric} {output.metric}.parsed'
shell:
    'bash src/QC/methyl_QC.sh {input} {output.methyl_counts} {output.methyl_summary}'
Snakemake: from line 163 of rules/qc.smk
shell:
    'cat {input.medips_qc} {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed > {output.full_qc} && rm {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_markdup_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed'
Snakemake: from line 190 of rules/qc.smk
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
src/bam2bedpe_scripts/bam2bedpe.sh:
usage(){
    echo 
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo 
}
no_args="true"
KEEP_TMP=false

## Help 
Help()
{
    # Display Help
    echo 
    echo "Processes bam to bedpe in chunks, preserving FLAG, TLEN and CIGAR fields."
    echo
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-s   [REQUIRED]  type either 'slurm' or 'local', local is with nohup"
    echo "-c   [REQUIRED]  number of chunks to process in parallel"
    echo "-b   [REQUIRED]  path to bam input (full path)"
    echo "-o   [REQUIRED]  output directory (full path)"
    echo "-a   [REQUIRED]  full path to conda activate (e.g. /cluster/home/[username]/bin/miniconda3/bin/activate)"
    echo "-e   [REQUIRED]  conda env with pysam (e.g. pipeline-medremixBEDPE)"
    echo "-t   [OPTIONAL]  keep tmp_dir"
    echo
}

## Get the options
while getopts ":hs:c:b:o:a:e:t" option; do
    case "${option}" in
        h) Help
           exit;;
        s) SLURMLOCAL=${OPTARG};;
        c) NUM_OF_CHUNKS=${OPTARG};;
        b) INPUT_BAM_PATH=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
        a) CONDA_ACTIVATE=${OPTARG};;
        e) CONDA_ENV=${OPTARG};;
        t) KEEP_TMP=true;;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }

echo "Processing step8_bam2bedpe..."
echo "number of chunks: $NUM_OF_CHUNKS"
echo "input bam path:   $INPUT_BAM_PATH"
echo "output path:      $OUT_DIR"
echo "processing on:    $SLURMLOCAL"

# Main program ##############################################

echo "Job started at "$(date) 
time1=$(date +%s)

#PY_SCRIPT_DIR=${SRC_DIR}
## pwd is workflow/ if using Snakemake
PY_SCRIPT_PATH="$(pwd)/src/bam2bedpe_scripts/bam2bedpe_pysam_v5_wCIGAR.py"
INPUT_DIR="${INPUT_BAM_PATH%/*}"
INPUT_BAM="${INPUT_BAM_PATH##*/}"

TMP_DIR="${OUT_DIR}/tmp_dir/${INPUT_BAM%.*}"
mkdir -p $TMP_DIR

OUT_FRAG_NAMES="${INPUT_BAM%.*}.fragNames"
OUT_MERGED_SORTD_BEDPE="${INPUT_BAM%.*}_coordSortd.bedpe"

## -------------------------------------- ##
## get fragNames
samtools view ${INPUT_DIR}/${INPUT_BAM} \
    | cut -f1 \
    | awk '{ a[$1]++ } END { for (b in a) { print b } }' \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}
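## e.g. (hypothetical read names): printf 'r1\nr2\nr1\n' | awk '{ a[$1]++ } END { for (b in a) { print b } }'
## prints 'r1' and 'r2' once each (order not guaranteed); mate 1 and mate 2 share a
## QNAME, so the awk array collapses each fragment to a single name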

## -------------------------------------- ##
## split bam into chunks
NUM_ROWS=$(cat ${TMP_DIR}/${OUT_FRAG_NAMES} | wc -l)
CHUNK_SIZE=$(( $NUM_ROWS / $NUM_OF_CHUNKS ))   ## integer division; may fall short, so the last chunk must run to the final row
echo ".bam has $NUM_ROWS fragments."
echo "Number of chunks was set to ${NUM_OF_CHUNKS}."
echo "Each chunk will be $CHUNK_SIZE rows."

for ((i=0; i<=$(( $NUM_OF_CHUNKS - 2 )); i++)); do
    CHUNK=$(( i + 1 ))
    ROW_START=$(( i*CHUNK_SIZE + 1))
    ROW_END=$(( CHUNK*CHUNK_SIZE ))
    echo "Processing CHUNK${CHUNK}... starting with row ${ROW_START}, ending in row ${ROW_END}."
    sed -n "$ROW_START,$ROW_END p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
        > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${CHUNK}

done

## since the rows might not divide evenly into chunks, the last chunk simply runs to the last row
ith=$(( $NUM_OF_CHUNKS - 1 ))
ROW_START=$(( ith*CHUNK_SIZE + 1 ))
echo "Processing CHUNK$NUM_OF_CHUNKS... starting with row ${ROW_START}, ending in row ${NUM_ROWS}."
sed -n "$ROW_START,$NUM_ROWS p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${NUM_OF_CHUNKS}
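## worked example with hypothetical numbers: NUM_ROWS=1003, NUM_OF_CHUNKS=4
##   CHUNK_SIZE = 1003 / 4 = 250 (integer division)
##   CHUNK1 = rows 1-250, CHUNK2 = rows 251-500, CHUNK3 = rows 501-750,
##   CHUNK4 = rows 751-1003 (absorbs the 3 leftover rows)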

## make log dir
mkdir -p ${OUT_DIR}/logs_slurm

## -------------------------------------- ##
## picard FilterSamReads on chunks
for CHUNK in ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK*; do
    echo "Picard FilterSamReads for ${CHUNK##*/}..."
    echo "Output is ${INPUT_BAM%.*}_${CHUNK##*.}.bam"

    echo "Writing out ${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    ## write out sample sbatch script
    cat <<- EOF > "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" 
	#!/bin/bash
	#SBATCH -t 3-00:00:00
	#SBATCH -J bam2bedpe_chunks_${CHUNK##*.}
	#SBATCH -D ${OUT_DIR}/logs_slurm
	#SBATCH --mail-type=ALL
	#SBATCH --mail-user=[email protected]
	#SBATCH -p himem
	#SBATCH -c 1
	#SBATCH --mem=16G
	#SBATCH -o ./%j-%x.out
	#SBATCH -e ./%j-%x.err

	source ${CONDA_ACTIVATE} ${CONDA_ENV}
	#PICARD_DIR=${PICARD_DIR}

	echo "Job started at "\$(date) 
	time1=\$(date +%s)

	picard FilterSamReads \
	    -I ${INPUT_DIR}/${INPUT_BAM} \
	    -O ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    -READ_LIST_FILE ${CHUNK} \
	    -FILTER includeReadList \
	    -WRITE_READS_FILES false \
	    -USE_JDK_DEFLATER true \
	    -USE_JDK_INFLATER true

	python ${PY_SCRIPT_PATH} \
	    --sort_bam_by_qname \
	    --bam_input ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    --bedpe_output ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bedpe"

	rm ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam"

	time2=\$(date +%s)
	echo "Job ended at "\$(date) 
	echo "Job took \$(((time2-time1)/3600)) hours \$((((time2-time1)%3600)/60)) minutes \$(((time2-time1)%60)) seconds"
	EOF

    if [[ "$SLURMLOCAL" == "slurm" ]]; then
        sbatch "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    elif [[ "$SLURMLOCAL" == "local" ]]; then
        nohup bash "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" &> "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.log" &
    fi

done

## periodically check if chunks have been written to completely
while true; do
    if [ $(find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" | wc -l) -eq $NUM_OF_CHUNKS ] 
    then
        echo "All bedpe chunks have been written to, merging bedpe chunks..."
        find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" -exec cat {} + \
                | sort -k1,1V -k2,2n -k3,3n -k5,5n -k6,6n \
                | gzip -c \
                > ${OUT_DIR}/${OUT_MERGED_SORTD_BEDPE}.gz
        break
    else
        echo "Sleeping for 5 minutes, then recheck if bedpe chunks were written to completely."
        sleep 5m
    fi
done

## remove TMP_DIR
if "$KEEP_TMP"; then
    echo "Keeping temp dir"
else
    echo "Removed temp dir after completion"
    rm -rf ${TMP_DIR}
fi


time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""
src/bam2bedpe_scripts/get_clean_bedpe4medremix_v4_getopts.sh:
usage(){
    echo 
    echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR" 
    echo 
}
no_args="true"

## Help 
Help()
{ 
    # Display Help
    echo 
    echo "Get clean bedpe for input into MedRemixBEDPE."
    echo
    echo "Usage: bash get_clean_bedpe4medremix.sh -i INPUT_PATH -s SAMPLE_NAME -p SPECIES -c CHR -f FRAGMENT_LENGTH_LIMIT -o OUT_DIR"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-i   [REQUIRED]  input file path for bedpe.gz (full path)"
    echo "-s   [REQUIRED]  sample name"
    echo "-p   [REQUIRED]  species (e.g. human, arabidopsis_F19K16, or arabidopsis_F24B22)"
    echo "-c   [REQUIRED]  chromosome to process (e.g. chr2)"
    echo "-f   [REQUIRED]  fragment length limit (e.g. 500)"
    echo "-o   [REQUIRED]  bedpe_bin_stats output directory (full path)"
    echo
}

## Get the options
while getopts ":hi:s:p:c:f:o:" option; do
    case "${option}" in
        h) Help
           exit;;
        i) INPUT_PATH=${OPTARG};;
        s) SAMPLE_NAME=${OPTARG};;
        p) SPECIES=${OPTARG};;
        c) CHR=${OPTARG};;
        f) FRAGMENT_LENGTH_LIMIT=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }


# Main program ##############################################

echo "Processing get_clean_bedpe4medremix... " 
echo "Job started at "$(date) 
time1=$(date +%s)

INPUT_DIR=${INPUT_PATH%/*}
INPUT_BEDPE_GZ=${INPUT_PATH##*/}
BEDPE_CHR="${SAMPLE_NAME}_${SPECIES}_${CHR}.bedpe"
BEDPE_NOAMBI_NO113n177_mapQge20_isMate="${BEDPE_CHR%.*}.noAmbiguous.no113n177.mapQge20.isMate.bedpe"
BEDPE_lean="${BEDPE_NOAMBI_NO113n177_mapQge20_isMate%.*}.lean"
BEDPE_lean_withEnd="${BEDPE_lean}.withEnd"
BEDPE_lenFiltd="${BEDPE_lean_withEnd}.lenFiltd.bedpe4medremix"

## get specific chr whether as mate1 or mate2
zcat ${INPUT_DIR}/${INPUT_BEDPE_GZ} \
    | awk -v CHR=$CHR '($1 == CHR) || ($4 == CHR) {print}' \
    > ${OUT_DIR}/${BEDPE_CHR}

## 4 filtering steps:
## remove 'ambiguous reads' as defined by Rsamtools
  ## i.e. remove any .bedpe alignment if flag_mate1or2==SUPP && chr_mate1==chr_mate2
  ## ( chr_mate1!=chr_mate2 supp reads dealt with separately )
## remove 'FLAG 113,177' tandem reads
## remove fragments if either mate's MAPQ < 20
## remove unmated alignments 
  ## ( as defined by Rsamtools: read1-read2; both secondary, or none; no wrongly oriented alignments )
cat ${OUT_DIR}/${BEDPE_CHR} \
    | awk '((and($14,0x800) || and($15,0x800)) && $1==$4) {next} {print}' \
    | awk '(($14 == 113 && $15 == 177) || ($14 == 177 && $15 == 113)) {next} {print}' \
    | awk '($8>=20 && $9>=20) {print}' \
    | awk '(and($14,0x40) && and($15,0x80)) ||
           (and($14,0x80) && and($15,0x40)) {print}' \
    | awk '(!and($14,0x100) && !and($15, 0x100)) ||
           (and($14,0x100) && and($15, 0x100)) {print}' \
    | awk '(and($14,0x10) && and($15,0x20)) ||
           (and($14,0x20) && and($15,0x10)) {print}' \
    > ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate}

## get lean bedpe for medremix and paste with qwidth for mate1 and mate2
## { qwidth calculated ignoring DHNPXB in CIGAR, as defined by Rsamtools }
paste \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | awk 'BEGIN{OFS="\t"}{print $1, $7, $2+1, $5+1, ($8+$9)/2, $10, $11}') \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | cut -f12 \
    | awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \
    | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \
    | bc) \
<(cat ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate} \
    | cut -f13 \
    | awk '{gsub(/[1-9][0-9]*[D|H|N|P|X|B]/,"",$1); print $1}' \
    | awk 'BEGIN{OFS=""}{gsub(/[A-Z]/,"+",$1); print $1,0}' \
    | bc) \
> ${OUT_DIR}/${BEDPE_lean}
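## worked example of the qwidth computation above (hypothetical CIGARs):
##   '60M2I38M'    --(drop D/H/N/P/X/B ops)--> '60M2I38M'
##                 --(letters to '+', append 0)--> '60+2+38+0'  --(bc)--> 100
##   '10S45M5D45M' --> '10S45M45M' --> '10+45+45+0' --> 100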

## use qwidth to calculate 'end'
cat ${OUT_DIR}/${BEDPE_lean} \
    | awk 'BEGIN{OFS="\t"} ($6=="+") {print $1,$2, $3,$4, $5, $6,$7, $8,$9}
                           ($6=="-") {print $1,$2, $4,$3, $5, $7,$6, $9,$8}' \
    | awk 'BEGIN{OFS="\t"} {print $1,$2, $6":"$3"-"$3+$8","$7":"$4"-"$4+$9,
                                  $3, $4+$9, $4+$9-$3, $5}' \
    > ${OUT_DIR}/${BEDPE_lean_withEnd}
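## worked example (hypothetical values): mate 1 on '+' at start 1001 with qwidth 100,
## mate 2 on '-' at start 1151 with qwidth 100:
##   start = 1001, end = 1151 + 100 = 1251, fragment length = 1251 - 1001 = 250
## (when mate 1 is on '-', the first awk swaps the mates so the '+' mate's start leads)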

## filter by FRAGMENT_LENGTH_LIMIT
cat ${OUT_DIR}/${BEDPE_lean_withEnd} \
    | awk -v FRAGMENT_LENGTH_LIMIT=$FRAGMENT_LENGTH_LIMIT 'BEGIN{OFS="\t"} {if(($6 > 0) && ($6 < FRAGMENT_LENGTH_LIMIT)) {print}}' \
    > ${OUT_DIR}/${BEDPE_lenFiltd}

## remove intermediate files
rm ${OUT_DIR}/${BEDPE_CHR}
rm ${OUT_DIR}/${BEDPE_NOAMBI_NO113n177_mapQge20_isMate}
rm ${OUT_DIR}/${BEDPE_lean}
rm ${OUT_DIR}/${BEDPE_lean_withEnd}



time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""
src/QC/methyl_QC.sh:
SEQMETH="F19K16"
SEQUMETH="F24B22"

BAM_PATH=$1
METH_COUNT=$2
METH_QC=$3

samtools view ${BAM_PATH} | \
    cut -f 3 | sort | uniq -c | sort -nr | \
    sed -e 's/^ *//;s/ /\t/' | \
    awk 'OFS="\t" {print $2,$1}' | \
    sort -n -k1,1 \
    > ${METH_COUNT}

total=$(samtools view -c ${BAM_PATH})
unmap=$(cat ${METH_COUNT} | grep '^\*' | cut -f2); if [[ -z $unmap ]]; then unmap="0"; fi
methyl=$(cat ${METH_COUNT} | grep ${SEQMETH} | cut -f2); if [[ -z $methyl ]]; then methyl="0"; fi
unmeth=$(cat ${METH_COUNT} | grep ${SEQUMETH} | cut -f2); if [[ -z $unmeth ]]; then unmeth="0"; fi
pct_meth_ctrl=$(echo "scale=5; ($methyl + $unmeth)/$total * 100" | bc -l); if [[ -z $pct_meth_ctrl ]]; then pct_meth_ctrl="0"; fi
pct_bac_meth=$(echo "scale=5; $methyl/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_meth ]]; then pct_bac_meth="0"; fi
pct_bac_unmeth=$(echo "scale=5; $unmeth/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_unmeth ]]; then pct_bac_unmeth="0"; fi
methyl_specificity=$(echo "scale=5; 100 - $pct_bac_unmeth/$pct_bac_meth" | bc -l); if [[ -z $methyl_specificity ]]; then methyl_specificity="0"; fi
bet_meth_ctrl=$(echo "scale=5; $methyl/($methyl + $unmeth)" | bc -l); if [[ -z $bet_meth_ctrl ]]; then bet_meth_ctrl="0"; fi

echo -e "Total_reads\tUnmapped_reads\tNum_reads_align_methyl_F19K16\tNum_reads_align_unmethyl_F24B22\tPctg_reads_aligned_to_BACs\tPctg_BACs_is_methyl_F19K16\tPctg_BACs_is_unmethyl_F24B22\tMethylation_specificity\tMethylation_beta" > ${METH_QC}
echo -e "$total\t$unmap\t$methyl\t$unmeth\t$pct_meth_ctrl\t$pct_bac_meth\t$pct_bac_unmeth\t$methyl_specificity\t$bet_meth_ctrl" >> ${METH_QC}

cat ${METH_QC} \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | awk 'BEGIN {OFS="\t"} $1="F19K16_F24B22_methyl_qc\t"$1' \
    > ${METH_QC}.parsed
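The two-row QC table (header row, then value row) is transposed into metric/value rows by the awk above; a minimal standalone demo of the same transpose idiom:

    printf 'A\tB\tC\n1\t2\t3\n' \
        | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                        max=(max<NF?NF:max)}
                        END {for (i=1; i<=max; i++)
                                {for (j=1; j<=NR; j++)
                                 printf "%s%s", a[i,j], (j==NR?RS:FS)
                                }
                            }'
    ## prints three tab-separated rows: 'A 1', 'B 2', 'C 3'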
src/QC/parse_picard_QC.sh:
METRIC=$1

cat $2 \
    | grep -A10 "## METRICS CLASS" \
    | grep -v "## METRICS CLASS" \
    | sed '/## HISTOGRAM/,+10d' \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | grep -v "^SAMPLE\|^LIBRARY\|^READ_GROUP" \
    | grep -v "ACCUMULATION_LEVEL" \
    | awk -v METRIC=$METRIC 'BEGIN {OFS="\t"} $1=METRIC"\t"$1' \
    > $3
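Example invocation, mirroring how the qc.smk rules call this parser above (file names hypothetical):

    bash src/QC/parse_picard_QC.sh picard.CollectInsertSizeMetrics \
        sample1_insert_size_metrics.txt sample1_insert_size_metrics.txt.parsed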
src/QC/QC_MEDIPS.R:
library(docopt)
## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r

doc <- "Get MEDIPS QC metrics.
Usage:
    QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ]

Options:
    --bamFile FILE       Aligned, sorted, filtered reads (bam)
    --outputDir DIR      Path to output folder
    --windowSize SIZE    Size of genomic windows (bp, e.g. 300)
    --genome GENOME      Path to a folder containing a custom BSgenome as a package, 
                             which will be loaded using devtools::load_all();
                             or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38
                             or BSgenome.Athaliana.TAIR.TAIR9)
    --help               show this help text
"
opt <- docopt(doc)

library(tidyverse)
library(gtools)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)

#if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) {
#    devtools::load_all(opt[['genome']])
#    bsgenome <- getBSgenome(basename(opt[['genome']]))
#} else {
#    bsgenome <- getBSgenome(opt[['genome']])
#}

if (!file.exists(opt$bamFile)){
  stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE)
}

## get user parameters
bam_file = opt$bamFile
ws = as.numeric(opt$windowSize)
out_dir = paste0(opt$outputDir, "/")
chr.select=paste0("chr", c(1:22,"X","Y"))
#chr.select=c("chr1","chr22")

BSgenome="BSgenome.Hsapiens.UCSC.hg38"
uniq = 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation
extend = 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/
shift = 0
paired = TRUE

# disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

## create MEDIPS set ###################################
message("Processing MEDIPS QC metrics: window size: ", ws)
MeDIPset =
  MEDIPS.createSet(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   window_size = ws,
                   chr.select = chr.select)

fname <- unlist(strsplit(basename(bam_file),split="\\."))[1]
# fname

## coupling set: maps CG densities across the genome
CS = MEDIPS.couplingVector(pattern="CG", refObj=MeDIPset)

## saturation analysis #################################
## whether a given set of mapped reads is sufficient to generate a saturated and reproducible coverage profile
## calculates Pearson correlation of coverage profile between exclusive sets A and B from a sample

sr =
  MEDIPS.saturation(file = bam_file,
                    BSgenome = BSgenome,
                    uniq = uniq,
                    extend = extend,
                    shift = shift,
                    paired = paired,
                    window_size = ws,
                    chr.select = chr.select,
                    nit = 10, nrit = 1, empty_bins = TRUE, rank = FALSE)
print(paste0("Estimated correlation is: ", round(sr$maxEstCor[2], 5)))
print(paste0("True correlation is: ", round(sr$maxTruCor[2],5)))

pdf(paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf"), width = 5, height = 4)
MEDIPS.plotSaturation(sr)
dev.off()

## sequence coverage analysis ##########################
## outputs #of CpGs covered/not covered in a sample

cr =
  MEDIPS.seqCoverage(file = bam_file,
                     pattern = "CG",
                     BSgenome = BSgenome,
                     uniq = uniq,
                     extend = extend,
                     shift = shift,
                     paired = paired,
                     chr.select = chr.select)
print(paste0("Total number of reads: ", cr$numberReads))
print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO))
print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5)))

print(paste0("Number of CpGs in reference: ", length(cr$cov.res)))
print(paste0("Number of CpG not covered by a read: ", length(cr$cov.res[cr$cov.res < 1])))
print(paste0("Number of CpG covered by 1 read: ", length(cr$cov.res[cr$cov.res == 1])))
print(paste0("Number of CpG covered by 2 reads: ", length(cr$cov.res[cr$cov.res == 2])))
print(paste0("Number of CpG covered by 3 reads: ", length(cr$cov.res[cr$cov.res == 3])))
print(paste0("Number of CpG covered by 4 reads: ", length(cr$cov.res[cr$cov.res == 4])))
print(paste0("Number of CpG covered by 5 reads: ", length(cr$cov.res[cr$cov.res == 5])))
print(paste0("Number of CpG covered by >5 reads: ", length(cr$cov.res[cr$cov.res > 5])))

print(paste0("Fraction of CpG not covered by a read: ", round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 1 read: ", round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 2 reads: ", round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 3 reads: ", round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 4 reads: ", round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 5 reads: ", round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by >5 reads: ", round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res),5)))


pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="pie",
                       cov.level = c(0,1,2,3,4,5),
                       main="Sequence pattern coverage, pie chart")
dev.off()

pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="hist",
                       t = 15,
                       main="Sequence pattern coverage, histogram")
dev.off()

## CpG enrichment #####################################
## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome
## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions
## genome.relH - relative freq of CpGs within reference genome
## enrichment.score.relH - regions.relH/genome.relH
## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions
## genome.GoGe - obs/exp ratio of CpGs within genomic regions
## enrichment.score.GoGe - regions.GoGe/genome.GoGe
## (relH and GoGe = 2 different ways of calculating enrichment)


## original MEDIPS.CpGenrich has IRanges issue
## this is adapted from script by Nick Cheng
MEDIPS.CpGenrichNew <-
  function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=F){

    ## Proof correctness....
    if(is.null(BSgenome)){stop("Must specify a BSgenome library.")}

    ## Read region file
    fileName=unlist(strsplit(file, "/"))[length(unlist(strsplit(file, "/")))]
    path=paste(unlist(strsplit(file, "/"))[1:(length(unlist(strsplit(file, "/"))))-1], collapse="/")
    if(path==""){path=getwd()}
    if(!fileName%in%dir(path)){stop(paste("File", fileName, " not found in", path, sep =" "))}

    dataset = get(ls(paste("package:", BSgenome, sep = ""))[1])

    if(!paired){GRange.Reads = getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}
    else{GRange.Reads = getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}

    ## Sort chromosomes
    if(length(unique(seqlevels(GRange.Reads)))>1){chromosomes=mixedsort(unique(seqlevels(GRange.Reads)))}
    if(length(unique(seqlevels(GRange.Reads)))==1){chromosomes=unique(seqlevels(GRange.Reads))}

    ## Get chromosome lengths for all chromosomes within data set.
    cat(paste("Loading chromosome lengths for ",BSgenome, "...\n", sep=""))

    chr_lengths=as.numeric(seqlengths(dataset)[chromosomes])

    ranges(GRange.Reads) <- restrict(ranges(GRange.Reads),+1)

    ##Calculate CpG density for regions
    total=length(chromosomes)
    cat("Calculating CpG density for given regions...\n")

    ## new code ##################################
    readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE))

    regions.CG = sum(vcountPattern("CG",readsChars))
    regions.C  = sum(vcountPattern("C",readsChars))
    regions.G  = sum(vcountPattern("G",readsChars))
    all.genomic= sum(width(readsChars))

    nReads <- length(readsChars)
    ###############################################

    regions.relH=as.numeric(regions.CG)/as.numeric(all.genomic)*100
    regions.GoGe=(as.numeric(regions.CG)*as.numeric(all.genomic))/(as.numeric(regions.C)*as.numeric(regions.G))

    cat(paste("Calculating CpG density for the reference genome",
              BSgenome, "...\n", sep = " "))

    CG <- DNAStringSet("CG")
    pdict0 <- PDict(CG)
    params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn"))
    genome.CG=sum(bsapply(params, pdict = pdict0))
    params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE)
    alphabet=bsapply(params)
    genome.l=sum(as.numeric(alphabet))
    genome.C=as.numeric(sum(alphabet[2,]))
    genome.G=as.numeric(sum(alphabet[3,]))
    genome.relH=genome.CG/genome.l*100
    genome.GoGe=(genome.CG*genome.l)/(genome.C*genome.G);

    ##Calculate CpG density for reference genome

    enrichment.score.relH=regions.relH/genome.relH
    enrichment.score.GoGe=regions.GoGe/genome.GoGe

    gc()
    return(list(genome=BSgenome,
                regions.CG=regions.CG,
                regions.C=regions.C,
                regions.G=regions.G,
                regions.relH=regions.relH,
                regions.GoGe=regions.GoGe,
                genome.C=genome.C,
                genome.G=genome.G,
                genome.CG=genome.CG,
                genome.relH=genome.relH,
                genome.GoGe=genome.GoGe,
                enrichment.score.relH=enrichment.score.relH,
                enrichment.score.GoGe=enrichment.score.GoGe))
  }

er =
  MEDIPS.CpGenrichNew(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   chr.select = chr.select)

## medips.satr.est_cor and medips.satr.tru_cor involve randomness and will not give identical results on repeat runs;
## the rest of the metrics should be identical on repeat runs

message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.")
QC_MEDIPS.df =
  data.frame(QC_type = rep("medips_QC", 33),
             metrics = c("ref_genome",
                         "satr.est_cor",
                         "satr.tru_cor",
                         "CpG_cov.totalNumReads",
                         "CpG_cov.numReadsWoCpG",
                         "CpG_cov.fracReadsWoCpG",
                         "CpG_cov.numCpGinRef",
                         "CpG_cov.numCpGwoReads",
                         "CpG_cov.numCpGw1read",
                         "CpG_cov.numCpGw2Reads",
                         "CpG_cov.numCpGw3Reads",
                         "CpG_cov.numCpGw4Reads",
                         "CpG_cov.numCpGw5Reads",
                         "CpG_cov.numCpGgt5Reads",
                         "CpG_cov.fracCpGwoReads",
                         "CpG_cov.fracCpGw1read",
                         "CpG_cov.fracCpGw2Reads",
                         "CpG_cov.fracCpGw3Reads",
                         "CpG_cov.fracCpGw4Reads",
                         "CpG_cov.fracCpGw5Reads",
                         "CpG_cov.fracCpGgt5Reads",
                         "enrich.regions.C",
                         "enrich.regions.G",
                         "enrich.regions.CG",
                         "enrich.genome.C",
                         "enrich.genome.G",
                         "enrich.genome.CG",
                         "enrich.regions.relH",
                         "enrich.genome.relH",
                         "enrich.regions.GoGe",
                         "enrich.genome.GoGe",
                         "enrich.enrichment.score.relH",
                         "enrich.enrichment.score.GoGe"),
             values = c(er$genome,
                        round(sr$maxEstCor[2], 5),
                        round(sr$maxTruCor[2], 5),
                        cr$numberReads,
                        cr$numberReadsWO,
                        round(cr$numberReadsWO / cr$numberReads, 5),
                        length(cr$cov.res),
                        length(cr$cov.res[cr$cov.res < 1]),
                        length(cr$cov.res[cr$cov.res == 1]),
                        length(cr$cov.res[cr$cov.res == 2]),
                        length(cr$cov.res[cr$cov.res == 3]),
                        length(cr$cov.res[cr$cov.res == 4]),
                        length(cr$cov.res[cr$cov.res == 5]),
                        length(cr$cov.res[cr$cov.res > 5]),
                        round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res), 5),
                        er$regions.C,
                        er$regions.G,
                        er$regions.CG,
                        er$genome.C,
                        er$genome.G,
                        er$genome.CG,
                        round(er$regions.relH, 5),
                        round(er$genome.relH, 5),
                        round(er$regions.GoGe, 5),
                        round(er$genome.GoGe, 5),
                        round(er$enrichment.score.relH, 5),
                        round(er$enrichment.score.GoGe, 5)))
names(QC_MEDIPS.df) = c("QC_type", "metrics", fname)
# QC_MEDIPS.df

write.table(QC_MEDIPS.df, paste0(out_dir, fname, "_QC_MEDIPS.csv"), row.names=F, quote=F, sep = '\t')
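Example invocation, matching the qc.smk rule shown above (paths hypothetical):

    Rscript src/QC/QC_MEDIPS.R --bamFile aligned/sample1.bam \
        --outputDir qc/medips --windowSize 300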
src/R/row_bind_tables.R:
' row_bind_tables.R

Usage: row_bind_tables.R ( -p PATHS | -i INPUT | -G GLOB ) ( --in-csv | --in-tsv | --in-feather | --in-parquet ) ( --out-csv | --out-tsv | --out-feather | --out-parquet ) -o OUTPUT [ --omit-paths --trim-paths --index-col-name ICN ]

Options:
    -p --paths PATHS            Comma separated list of paths to TSV files
    -i --input INPUT            Path to a file containing paths to tables, one per line
    -G --glob GLOB              Globstring matching paths

    -o --output OUTPUT          Path to output

    --omit-paths                Exclude column listing the path to file
    --trim-paths                Includes only the file name instead of full path
    --index-col-name ICN        Name of the index column containing the paths. Defaults to "paths"
' -> doc

library(docopt)
library(plyr)
library(readr)
library(doParallel)
library(arrow)
args <- docopt(doc)
print(args)

if ( ! is.null(args[['paths']]) ) {
    paths = strsplit(args[['paths']], ',')[[1]]
} else if (! is.null(args[['input']])) {
    paths = readLines(args[['input']])
} else if ( ! is.null(args[['glob']]) ) {
    paths = Sys.glob(args[['glob']])
} else {
    stop('Must provide one of --paths, --input, or --glob.')
}

paths <- unique(paths)

message('Merging files')

registerDoParallel()

options(readr.show_progress = FALSE)

output <- ddply(data.frame(paths), 'paths', function(z) {
      path = as.character(z$paths)
      message(paste0('Reading file: ', path))

      if (args[['--in-tsv']]) {
          infile=suppressMessages(read_tsv(path))
      } else if (args[['--in-csv']]) {
          infile=suppressMessages(read_csv(path))
      } else if (args[['--in-feather']]) {
          infile=read_feather(path)
      } else if (args[['--in-parquet']]) {
          infile=read_parquet(path)
      } else {
          stop('Must provide an input file format')
      }
      return(infile)
}, .parallel = TRUE)

if (args[['--trim-paths']]) {
    output <- dplyr::mutate(output,
        paths = sapply(as.character(paths), function(p) { tail(strsplit(p, '/')[[1]], 1) })
    )
}

if (args[['--omit-paths']]) {
    output <- dplyr::select(output, -paths)
}

if (is.null(args[['--index-col-name']])) {
    icn = 'paths'
} else {
    icn = as.character(args[['--index-col-name']])
}
colnames(output)[colnames(output) == 'paths'] <- icn

message('Done merging')

if (args[['--out-tsv']]) {
    write_tsv(output, args[['output']])
} else if (args[['--out-csv']]) {
    write_csv(output, args[['output']])
} else if (args[['--out-feather']]) {
    write_feather(output, args[['output']])
} else if (args[['--out-parquet']]) {
    write_parquet(output, args[['output']])
} else {
    stop('Must provide an output file format')
}

message(paste0('Output written to ', args[['output']]))
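Example invocation, as used by the bin-stats merging rules above (paths hypothetical):

    Rscript src/R/row_bind_tables.R -p "chr1_binstats.tsv,chr2_binstats.tsv" \
        -o sample1_binstats.feather --in-tsv --out-feather --omit-paths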
URL: https://github.com/pughlab/PLBR_MedRemixBEDPE
Name: plbr_medremixbedpe
Version: v0.2.0
Copyright: Public Domain
License: None