Run MeDEStrand with BEDPE input as part of the PLBR database workflow


Summary

  • Runs the MeDEStrand methylation profiler with BEDPE input as part of the PLBR database workflow; a minimal invocation sketch follows below.

  • The pipeline is an extension of the OICR cfMeDIP-seq analysis pipeline (https://github.com/oicr-gsi/wf_cfmedip).
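The workflow is driven by Snakemake (the snippets below come from rule files such as rules/qc.smk). As a rough orientation, a minimal, hypothetical top-level invocation might look like the following; the entry point, core count, and config file name are assumptions, not values documented on this page:

    # hypothetical invocation sketch -- directory, core count and config name are placeholders
    cd PLBR_MeDEStrandBEDPE/workflow
    snakemake --cores 8 --configfile config.yaml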

Code Snippets

shell:
    'bash src/bam2bedpe_scripts/bam2bedpe.sh -s {params.slurm_local} -c {params.chunks} -b {input} -o {params.out_dir} -a {params.conda_activate} -e {params.conda_env}'
shell:
    'bash src/bam2bedpe_scripts/bedpe2leanbedpe_for_MeDEStrand.sh -i {input} -o {params.out_dir} -m {output}'
shell:
    'sh src/QC/getFilterMetrics.sh -r {input.bam_raw} -a {input.bam_filt1} -b {input.bam_filt2} -c {input.bam_filt3} -d {input.bam_dedupd} -o {output}'
shell:
    'bwa mem -t 8 {params.bwa_index} {input} > {output}'
shell:
    'samtools view -b {input} | samtools sort -o {output}'
shell:
    'samtools merge {output} {input} && samtools index {output}'
shell:
    'samtools view -b -F 260 {input} -o {output}'
shell:
    "samtools view {input} | awk 'sqrt($9*$9)>119 && sqrt($9*$9)<501' | awk '{{print $1}}' > {output.mpp_reads} && picard FilterSamReads -I {input} -O {output.filt2_bam} -READ_LIST_FILE {output.mpp_reads} -FILTER includeReadList -WRITE_READS_FILES false"
shell:
    "samtools view {input} | awk '{{read=$0;sub(/.*NM:i:/,X,$0);sub(/\t.*/,X,$0);if(int($0)>7) {{print read}}}}' | awk '{{print $1}}' > {output.HiMM_reads} && picard FilterSamReads -I {input} -O {output.filt3_bam} -READ_LIST_FILE {output.HiMM_reads} -FILTER excludeReadList -WRITE_READS_FILES false && samtools index {output.filt3_bam}"
shell:
    'umi_tools dedup --paired -I {input} -S {output.dedupd_bam}'
shell:
    'Rscript src/R/MeDEStrandBEDPE.R --inputFile {input} --outputFile {output} --windowSize {params.winsize} --chr_select "{params.chr_select}"'
shell:
    'ln -s {input} {output} && samtools index {output}'
shell:
    'ln -s {input} {output}'
shell:
    'umi_tools extract --extract-method=string --bc-pattern={params.umi_pat1} --bc-pattern2={params.umi_pat2} -I {input.R1} --read2-in={input.R2} -S {output.R1} --read2-out={output.R2}'
shell:
    'fastqc --outdir {params.outdir} {input}'
shell:
    'samtools view {input} | grep -v "F19K16\|F24B22" | cat <(samtools view -H {input} | grep -v "F19K16\|F24B22") - | samtools view -b - > {output} && samtools index {output}'
shell:
    'fastqc --outdir {params.outdir} {input}'
shell:
    'samtools flagstat {input} > {output}'
shell:
    'Rscript src/QC/QC_MEDIPS.R --bamFile {input} --outputDir {params.outdir} --windowSize {params.winsize}'
SnakeMake From line 84 of rules/qc.smk
shell:
    'picard CollectGcBiasMetrics -I {input} -O {output.metric} -CHART {output.chart} -S {output.summary} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectGcBiasMetrics {output.summary} {output.summary}.parsed'
shell:
    'picard CollectInsertSizeMetrics -INCLUDE_DUPLICATES true -I {input} -O {output.metric} -H {output.histo} -M 0 -W 600 && bash src/QC/parse_picard_QC.sh picard.CollectInsertSizeMetrics {output.metric} {output.metric}.parsed'
shell:
    'picard MarkDuplicates -I {input} -O {output.dedupd_bam} -M {output.metric} && bash src/QC/parse_picard_QC.sh picard.MarkDuplicates {output.metric} {output.metric}.parsed'
shell:
    'picard CollectAlignmentSummaryMetrics -I {input} -O {output.metric} -R {params.fasta} && bash src/QC/parse_picard_QC.sh picard.CollectAlignmentSummaryMetrics {output.metric} {output.metric}.parsed'
shell:
    'picard CollectQualityYieldMetrics -I {input} -O {output.metric} && bash src/QC/parse_picard_QC.sh picard.CollectQualityYieldMetrics {output.metric} {output.metric}.parsed'
shell:
    'bash src/QC/methyl_QC.sh {input} {output.methyl_counts} {output.methyl_summary}'
SnakeMake From line 181 of rules/qc.smk
shell:
    'cat {input.medips_qc} {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_dedupd_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed > {output.full_qc} && rm {input.F19K16_F24B22_methyl_summary}.parsed {input.picard_gcbias_summary}.parsed {input.picard_insertsize_metric}.parsed {input.picard_dedupd_metric}.parsed {input.picard_alignment_metric}.parsed {input.picard_quality_metric}.parsed'
SnakeMake From line 208 of rules/qc.smk
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
shell:
    'touch {output}'
src/bam2bedpe_scripts/bam2bedpe.sh:
usage(){
    echo 
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo 
}
no_args="true"
KEEP_TMP=false

## Help 
Help()
{
    # Display Help
    echo 
    echo "Processes bam to bedpe in chunks, preserving FLAG, TLEN and CIGAR fields."
    echo
    echo "Usage: bash/sbatch bam2bedpe.sh -s [slurm|local] -c num_of_chunks -b bam_input_path -o output_dir -a CONDA -e ENV -t"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-s   [REQUIRED]  type either 'slurm' or 'local', local is with nohup"
    echo "-c   [REQUIRED]  number of chunks to process in parallel"
    echo "-b   [REQUIRED]  path to bam input (full path)"
    echo "-o   [REQUIRED]  output directory (full path)"
    echo "-a   [REQUIRED]  full path to conda activate (e.g. /cluster/home/[username]/bin/miniconda3/bin/activate)"
    echo "-e   [REQUIRED]  conda env with pysam (e.g. pipeline-medremixBEDPE)"
    echo "-t   [OPTIONAL]  keep tmp_dir"
    echo
}

## Get the options
while getopts ":hs:c:b:o:a:e:t" option; do
    case "${option}" in
        h) Help
           exit;;
        s) SLURMLOCAL=${OPTARG};;
        c) NUM_OF_CHUNKS=${OPTARG};;
        b) INPUT_BAM_PATH=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
        a) CONDA_ACTIVATE=${OPTARG};;
        e) CONDA_ENV=${OPTARG};;
        t) KEEP_TMP=true;;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }

echo "Processing step8_bam2bedpe..."
echo "number of chunks: $NUM_OF_CHUNKS"
echo "input bam path:   $INPUT_BAM_PATH"
echo "output path:      $OUT_DIR"
echo "processing on:    $SLURMLOCAL"

# Main program ##############################################

echo "Job started at "$(date) 
time1=$(date +%s)

#PY_SCRIPT_DIR=${SRC_DIR}
## pwd is workflow/ if using Snakemake
PY_SCRIPT_PATH="$(pwd)/src/bam2bedpe_scripts/bam2bedpe_pysam_v5_wCIGAR.py"
INPUT_DIR="${INPUT_BAM_PATH%/*}"
INPUT_BAM="${INPUT_BAM_PATH##*/}"

TMP_DIR="${OUT_DIR}/tmp_dir/${INPUT_BAM%.*}"
mkdir -p $TMP_DIR

OUT_FRAG_NAMES="${INPUT_BAM%.*}.fragNames"
OUT_MERGED_SORTD_BEDPE="${INPUT_BAM%.*}_coordSortd.bedpe"

## -------------------------------------- ##
## get fragNames
samtools view ${INPUT_DIR}/${INPUT_BAM} \
    | cut -f1 \
    | awk '{ a[$1]++ } END { for (b in a) { print b } }' \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}

## -------------------------------------- ##
## split bam into chunks
NUM_ROWS=$(cat ${TMP_DIR}/${OUT_FRAG_NAMES} | wc -l)
CHUNK_SIZE=$(( $NUM_ROWS / $NUM_OF_CHUNKS ))   ## won't have decimal, sometimes will be short, last chunk has to go to last row
echo ".bam has $NUM_ROWS fragments."
echo "Number of chunks was set to ${NUM_OF_CHUNKS}."
echo "Each chunk will be $CHUNK_SIZE rows."

for ((i=0; i<=$(( $NUM_OF_CHUNKS - 2 )); i++)); do
    CHUNK=$(( i + 1 ))
    ROW_START=$(( i*CHUNK_SIZE + 1))
    ROW_END=$(( CHUNK*CHUNK_SIZE ))
    echo "Processing CHUNK${CHUNK}... starting with row ${ROW_START}, ending in row ${ROW_END}."
    sed -n "$ROW_START,$ROW_END p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
        > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${CHUNK}

done

## since chunk might not be evenly divided, last chunk will just be to last row
ith=$(( $NUM_OF_CHUNKS - 1 ))
ROW_START=$(( ith*CHUNK_SIZE + 1 ))
echo "Processing CHUNK$NUM_OF_CHUNKS... starting with row ${ROW_START}, ending in row ${NUM_ROWS}."
sed -n "$ROW_START,$NUM_ROWS p" ${TMP_DIR}/${OUT_FRAG_NAMES} \
    > ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK${NUM_OF_CHUNKS}

## make log dir
mkdir -p ${OUT_DIR}/logs_slurm

## -------------------------------------- ##
## picard FilterSamReads on chunks
for CHUNK in ${TMP_DIR}/${OUT_FRAG_NAMES}.CHUNK*; do
    echo "Picard FilterSamReads for ${CHUNK##*/}..."
    echo "Output is ${INPUT_BAM%.*}_${CHUNK##*.}.bam"

    echo "Writing out ${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    ## write out sample sbatch script
    cat <<- EOF > "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" 
	#!/bin/bash
	#SBATCH -t 3-00:00:00
	#SBATCH -J bam2bedpe_chunks_${CHUNK##*.}
	#SBATCH -D ${OUT_DIR}/logs_slurm
	#SBATCH --mail-type=ALL
	#SBATCH [email protected]
	#SBATCH -p himem
	#SBATCH -c 1
	#SBATCH --mem=16G
	#SBATCH -o ./%j-%x.out
	#SBATCH -e ./%j-%x.err

	source ${CONDA_ACTIVATE} ${CONDA_ENV}
	#PICARD_DIR=${PICARD_DIR}

	echo "Job started at "\$(date) 
	time1=\$(date +%s)

	picard FilterSamReads \
	    -I ${INPUT_DIR}/${INPUT_BAM} \
	    -O ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    -READ_LIST_FILE ${CHUNK} \
	    -FILTER includeReadList \
	    -WRITE_READS_FILES false \
	    -USE_JDK_DEFLATER true \
	    -USE_JDK_INFLATER true \

	python ${PY_SCRIPT_PATH} \
	    --sort_bam_by_qname \
	    --bam_input ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam" \
	    --bedpe_output ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bedpe"

	rm ${TMP_DIR}/"${INPUT_BAM%.*}_${CHUNK##*.}.bam"

	time2=\$(date +%s)
	echo "Job ended at "\$(date) 
	echo "Job took \$(((time2-time1)/3600)) hours \$((((time2-time1)%3600)/60)) minutes \$(((time2-time1)%60)) seconds"
	EOF

    if [ $SLURMLOCAL == "slurm" ]; then
        sbatch "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh"
    elif [ $SLURMLOCAL == "local" ]; then
        nohup bash "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.sh" &> "${TMP_DIR}/${INPUT_BAM%.*}_bam2bedpe_${CHUNK##*.}.log" &
    fi

done

## periodically check if chunks have been written to completely
while true; do
    if [ $(find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" | wc -l) -eq $NUM_OF_CHUNKS ] 
    then
        echo "All bedpe chunks have been written to, merging bedpe chunks..."
        find ${TMP_DIR} -maxdepth 1 -mmin +3 -type f -regex ".*_CHUNK[1-9][0-9]*\.bedpe" -exec cat {} + \
                | sort -k1,1V -k2,2n -k3,3n -k5,5n -k6,6n \
                | gzip -c \
                > ${OUT_DIR}/${OUT_MERGED_SORTD_BEDPE}.gz
        break
    else
        echo "Sleeping for 5 minutes, then recheck if bedpe chunks were written to completely."
        sleep 5m
    fi
done

## remove TMP_DIR
if "$KEEP_TMP"; then
    echo "Keeping temp dir"
else
    echo "Removing temp dir after completion..."
    rm -rf ${TMP_DIR}
fi


time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""
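For reference, a hypothetical stand-alone invocation of this script; all paths are placeholders, and in the workflow the Snakemake rule above fills them in from {input} and {params}:

    bash src/bam2bedpe_scripts/bam2bedpe.sh \
        -s local \
        -c 4 \
        -b /full/path/to/sample_dedupd.bam \
        -o /full/path/to/bedpe_out \
        -a /cluster/home/username/bin/miniconda3/bin/activate \
        -e pipeline-medremixBEDPE

With these arguments the merged, coordinate-sorted output is written to /full/path/to/bedpe_out/sample_dedupd_coordSortd.bedpe.gz, and -s local runs the per-chunk jobs under nohup instead of sbatch.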
src/bam2bedpe_scripts/bedpe2leanbedpe_for_MeDEStrand.sh:
usage(){
    echo 
    echo "Usage: bash bedpe2leanbedpe_for_MeDEStrand.sh -i INPUT_BEDPE -c [CHR_SELECT] -o OUT_DIR -m FOR_MEDE_OUT_FILE -s [KEEP_2NDARY_READS] -t [KEEP_TMP_DIR]" 
    echo 
}
no_args="true"
KEEP_TMP=false
KEEP_SECONDARY_READS=false

## Help 
Help()
{
    # Display Help
    echo 
    echo "Process .bedpe into a lean .bedpe for input into MeDEStrandBEDPE R package."
    echo
    echo "Usage: bash bedpe2leanbedpe_for_MeDEStrand.sh -i INPUT_BEDPE -c [CHR_SELECT] -o OUT_DIR -m FOR_MEDE_OUT_FILE -s [KEEP_2NDARY_READS] -t [KEEP_TMP_DIR]"
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-i   [REQUIRED]  input bedpe.gz (full path)" 
    echo "-c   [OPTIONAL]  chr select file (full path, if specified will select these chromosomes)"
    echo "-o   [REQUIRED]  output directory (full path)"
    echo "-m   [REQUIRED]  output file (full path)"
    echo "-s   [OPTIONAL]  keep secondary reads (false if no -s)"
    echo "-t   [OPTIONAL]  keep temporary directory (false if no -t)"
    echo
}

## function that allows optional argument
getopts_get_optional_argument() {
  eval next_token=\${$OPTIND}
  if [[ -n $next_token && $next_token != -* ]]; then
    OPTIND=$((OPTIND + 1))
    OPTARG=$next_token
  else
    OPTARG=""
  fi
}

## Get the options
while getopts ":hi:co:m:st" option; do
    case "${option}" in
        h) Help
           exit;;
        i) INPUT_BEDPE=${OPTARG};;
        c) getopts_get_optional_argument $@
           CHR_SELECT=${OPTARG};;
        o) OUT_DIR=${OPTARG};;
        m) OUT_FILE=${OPTARG};;
        s) KEEP_SECONDARY_READS=true;;
        t) KEEP_TMP=true;;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }

echo "Processing step09_bedpe2leanbedpe_for_MeDEStrand..."
echo "input bedpe.gz path:       $INPUT_BEDPE"
echo "chromosome selection file: $CHR_SELECT"
echo "output path:               $OUT_DIR"
echo "keep secondary reads?      $KEEP_SECONDARY_READS"
echo "keep temporary directory?  $KEEP_TMP"

# Main program ##############################################

echo ""
echo "Job started at "$(date) 
time1=$(date +%s)
echo ""

FULL_BEDPE_FNAME="${INPUT_BEDPE##*/}"
BEDPE_M_ONLY="${FULL_BEDPE_FNAME%.*}_Monly.bedpe"
BEDPE_NO_UNMAPPED="${BEDPE_M_ONLY%.*}_noUnmapped.bedpe"
BEDPE_NO_SEC="${BEDPE_NO_UNMAPPED%.*}_no256.bedpe"
BEDPE_PP="${BEDPE_NO_SEC%.*}_pp.bedpe"
BEDPE_4MEDE="${FULL_BEDPE_FNAME%.bedpe.gz}_4mede.bedpe"
#BEDPE_4MEDE_CHR_SELECT="${BEDPE_4MEDE%.*}_chrSelect.bedpe"

mkdir -p "${OUT_DIR}/tmp"

echo "Getting fragments with only M in CIGAR..."
zcat ${INPUT_BEDPE} \
    | awk '$12!~/[HIDNSP=X]/' \
    > ${OUT_DIR}/tmp/${BEDPE_M_ONLY}

echo "Removing fragments with unmapped reads..."
cat ${OUT_DIR}/tmp/${BEDPE_M_ONLY} \
    | awk '(!and($14,0x4)) {print}' \
    | awk '(!and($14,0x8)) {print}' \
    | awk '(!and($15,0x4)) {print}' \
    | awk '(!and($15,0x8)) {print}' \
    > ${OUT_DIR}/tmp/${BEDPE_NO_UNMAPPED}

if [ "$KEEP_SECONDARY_READS" != true ]; then
    echo "Removing fragments that have secondary reads..."
    cat ${OUT_DIR}/tmp/${BEDPE_NO_UNMAPPED} \
        | awk '(!and($14,0x100)) {print}' \
        | awk '(!and($15,0x100)) {print}' \
        > ${OUT_DIR}/tmp/${BEDPE_NO_SEC}
else
    ## keep secondary reads: carry the previous file forward under the name the next step expects
    cp ${OUT_DIR}/tmp/${BEDPE_NO_UNMAPPED} ${OUT_DIR}/tmp/${BEDPE_NO_SEC}
fi

#echo "Removing fragments that have supplementary reads..."
### { skip for now }

echo "Removing fragments that are not proper pair..."
cat ${OUT_DIR}/tmp/${BEDPE_NO_SEC} \
    | awk '(and($14,0x2)) {print}' \
    > ${OUT_DIR}/tmp/${BEDPE_PP}

### { no need for this, MeDEStrand does not need qwidth } 
#echo "Get qwidth for sample..."
#QWIDTH=$(cut -f12 ${OUT_DIR}/tmp/${BEDPE_PP} | sort | uniq | sed 's/.$//')
#echo $QWIDTH

echo "Getting only necessary columns for MeDEStrand input..."
cat ${OUT_DIR}/tmp/${BEDPE_PP} \
    | awk 'BEGIN{OFS="\t"} {print $1,$10,$2+1,"1",$5+1,sqrt($17*$17)}' \
    > ${OUT_FILE}

#if [ ! -z "$CHR_SELECT" ]; then
#    echo "Getting only selected chromosomes..."
#    grep -wf ${CHR_SELECT} ${OUT_DIR}/${BEDPE_4MEDE} \
#        > ${OUT_DIR}/${BEDPE_4MEDE_CHR_SELECT}
#    rm ${OUT_DIR}/${BEDPE_4MEDE}
#fi

if [ "$KEEP_TMP" != true ]; then
    echo "Removing tmp directory..."
    rm -r ${OUT_DIR}/tmp/
fi

time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""
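A hypothetical invocation matching the Snakemake rule above (file names are placeholders; -c, -s and -t are optional):

    bash src/bam2bedpe_scripts/bedpe2leanbedpe_for_MeDEStrand.sh \
        -i /full/path/to/sample_dedupd_coordSortd.bedpe.gz \
        -o /full/path/to/lean_out \
        -m /full/path/to/sample_dedupd_4mede.bedpe

The -m file is the lean six-column BEDPE that MeDEStrandBEDPE.R reads as input.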
src/QC/getFilterMetrics.sh:
usage(){
    echo 
    echo "Usage: bash getFilterMetrics.sh -r BAM_RAW -a BAM_FILT1 -b BAM_FILT2 -c BAM_FILT3 -d BAM_DEDUPD -o FILT_METRICS" 
    echo 
}
no_args="true"

## Help 
Help()
{   
    # Display Help
    echo 
    echo "Get how many reads were removed during each filtering step."
    echo
    echo "Usage: bash getFilterMetrics.sh -r BAM_RAW -a BAM_FILT1 -b BAM_FILT2 -c BAM_FILT3 -d BAM_DEDUPD -o FILT_METRICS" 
    echo "options:"
    echo "-h   [HELP]      print help"
    echo "-r   [REQUIRED]  raw/unfiltered bam (full path)"
    echo "-a   [REQUIRED]  bam with unmapped and secondary reads removed (full path)"
    echo "-b   [REQUIRED]  bam with reads from fragments shorter than 120nt or longer than 500nt removed (full path)"
    echo "-c   [REQUIRED]  bam with reads with edit distance > 7 from reference removed (full path)"
    echo "-d   [REQUIRED]  bam UMI deduplicated (full path)"
    echo "-o   [REQUIRED]  output filter metrics file (full path)"
    echo
}

## Get the options
while getopts ":hr:a:b:c:d:o:" option; do
    case "${option}" in
        h) Help
           exit;;
        r) BAM_RAW=${OPTARG};;
        a) BAM_FILT1=${OPTARG};;
        b) BAM_FILT2=${OPTARG};;
        c) BAM_FILT3=${OPTARG};;
        d) BAM_DEDUPD=${OPTARG};;
        o) FILT_METRICS=${OPTARG};;
       \?) echo "Error: Invalid option"
           exit;;
    esac
    no_args="false"
done

[[ "$no_args" == "true" ]] && { usage; exit 1; }


# Main program ##############################################

echo "Processing getFilterMetrics..." 
echo "Job started at "$(date) 
time1=$(date +%s)

bam_raw=$(samtools view -c ${BAM_RAW})
bam_filter1=$(samtools view -c ${BAM_FILT1})
bam_filter2=$(samtools view -c ${BAM_FILT2})
bam_filter3=$(samtools view -c ${BAM_FILT3})
bam_dedupd=$(samtools view -c ${BAM_DEDUPD})

echo -e "bam_stage\tread_count" > ${FILT_METRICS}
echo -e "bam_raw\t$bam_raw" >> ${FILT_METRICS}
echo -e "bam_filter1\t$bam_filter1" >> ${FILT_METRICS}
echo -e "bam_filter2\t$bam_filter2" >> ${FILT_METRICS}
echo -e "bam_filter3\t$bam_filter3" >> ${FILT_METRICS}
echo -e "bam_dedupd\t$bam_dedupd" >> ${FILT_METRICS}


echo "Finished processing getting filtering metrics."

time2=$(date +%s)
echo "Job ended at "$(date) 
echo "Job took $(((time2-time1)/3600)) hours $((((time2-time1)%3600)/60)) minutes $(((time2-time1)%60)) seconds"
echo ""
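A hypothetical invocation (bam names are placeholders; each argument corresponds to one filtering stage of the pipeline):

    bash src/QC/getFilterMetrics.sh \
        -r sample_raw.bam \
        -a sample_filt1.bam \
        -b sample_filt2.bam \
        -c sample_filt3.bam \
        -d sample_dedupd.bam \
        -o sample_filterMetrics.tsv

The output is a two-column table (bam_stage, read_count) with one row per stage.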
src/QC/methyl_QC.sh:
SEQMETH="F19K16"
SEQUMETH="F24B22"

BAM_PATH=$1
METH_COUNT=$2
METH_QC=$3

samtools view ${BAM_PATH} | \
    cut -f 3 | sort | uniq -c | sort -nr | \
    sed -e 's/^ *//;s/ /\t/' | \
    awk 'OFS="\t" {print $2,$1}' | \
    sort -n -k1,1 \
    > ${METH_COUNT}

total=$(samtools view -c ${BAM_PATH})
unmap=$(cat ${METH_COUNT} | grep '^\*' | cut -f2); if [[ -z $unmap ]]; then unmap="0"; fi
methyl=$(cat ${METH_COUNT} | grep ${SEQMETH} | cut -f2); if [[ -z $methyl ]]; then methyl="0"; fi
unmeth=$(cat ${METH_COUNT} | grep ${SEQUMETH} | cut -f2); if [[ -z $unmeth ]]; then unmeth="0"; fi
pct_meth_ctrl=$(echo "scale=5; ($methyl + $unmeth)/$total * 100" | bc -l); if [[ -z $pct_meth_ctrl ]]; then pct_meth_ctrl="0"; fi
pct_bac_meth=$(echo "scale=5; $methyl/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_meth ]]; then pct_bac_meth="0"; fi
pct_bac_unmeth=$(echo "scale=5; $unmeth/($methyl + $unmeth) * 100" | bc -l); if [[ -z $pct_bac_unmeth ]]; then pct_bac_unmeth="0"; fi
methyl_specificity=$(echo "scale=5; 100 - $pct_bac_unmeth/$pct_bac_meth" | bc -l); if [[ -z $methyl_specificity ]]; then methyl_specificity="0"; fi
bet_meth_ctrl=$(echo "scale=5; $methyl/($methyl + $unmeth)" | bc -l); if [[ -z $bet_meth_ctrl ]]; then bet_meth_ctrl="0"; fi

echo -e "Total_reads\tUnmapped_reads\tNum_reads_align_methyl_F19K16\tNum_reads_align_unmethyl_F24B22\tPctg_reads_aligned_to_BACs\tPctg_BACs_is_methyl_F19K16\tPctg_BACs_is_unmethyl_F24B22\tMethylation_specificity\tMethylation_beta" > ${METH_QC}
echo -e "$total\t$unmap\t$methyl\t$unmeth\t$pct_meth_ctrl\t$pct_bac_meth\t$pct_bac_unmeth\t$methyl_specificity\t$bet_meth_ctrl" >> ${METH_QC}

cat ${METH_QC} \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | awk 'BEGIN {OFS="\t"} $1="F19K16_F24B22_methyl_qc\t"$1' \
    > ${METH_QC}.parsed
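The script takes three positional arguments, as in the Snakemake rule above. A hypothetical invocation with placeholder file names:

    bash src/QC/methyl_QC.sh \
        sample_dedupd.bam \
        sample_methyl_counts.tsv \
        sample_methyl_summary.tsv

Besides the per-contig count table and the one-row summary, it writes sample_methyl_summary.tsv.parsed, the transposed and labelled version that the aggregate QC rule concatenates.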
src/QC/parse_picard_QC.sh:
METRIC=$1

cat $2 \
    | grep -A10 "## METRICS CLASS" \
    | grep -v "## METRICS CLASS" \
    | sed '/## HISTOGRAM/,+10d' \
    | awk -F "\t" '{for (i=1; i<=NF; i++) a[i,NR]=$i
                    max=(max<NF?NF:max)}
                    END {for (i=1; i<=max; i++)
                            {for (j=1; j<=NR; j++) 
                             printf "%s%s", a[i,j], (j==NR?RS:FS)
                            }
                        }' \
    | grep -v "^SAMPLE\|^LIBRARY\|^READ_GROUP" \
    | grep -v "ACCUMULATION_LEVEL" \
    | awk -v METRIC=$METRIC 'BEGIN {OFS="\t"} $1=METRIC"\t"$1' \
    > $3
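A hypothetical invocation mirroring how the Picard QC rules above call this script; the first argument is the label prepended to every metric, the second is the Picard metrics file, the third is the parsed output:

    bash src/QC/parse_picard_QC.sh \
        picard.CollectInsertSizeMetrics \
        sample_insert_size_metrics.txt \
        sample_insert_size_metrics.txt.parsed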
src/QC/QC_MEDIPS.R:
library(docopt)
## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r

doc <- "Get MEDIPS QC metrics.
Usage:
    QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ]

Options:
    --bamFile FILE       Aligned, sorted, filtered reads (bam)
    --outputDir DIR      Path to output folder
    --windowSize SIZE    Size of genomic windows (bp, e.g. 300)
    --genome GENOME      Path to a folder containing a custom BSgenome as a package, 
                             which will be loaded using devtools::load_all();
                             or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38
                             or BSgenome.Athaliana.TAIR.TAIR9)
    --help               show this help text
"
opt <- docopt(doc)

library(tidyverse)
library(gtools)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)

#if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) {
#    devtools::load_all(opt[['genome']])
#    bsgenome <- getBSgenome(basename(opt[['genome']]))
#} else {
#    bsgenome <- getBSgenome(opt[['genome']])
#}

if (!file.exists(opt$bamFile)){
  stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE)
}

## get user parameters
bam_file = opt$bamFile
ws = as.numeric(opt$windowSize)
out_dir = paste0(opt$outputDir, "/")
chr.select=paste0("chr", c(1:22,"X","Y"))
#chr.select=c("chr1","chr22")

BSgenome="BSgenome.Hsapiens.UCSC.hg38"
uniq = 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation
extend = 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/
shift = 0
paired = TRUE

# disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

## create MEDIPS set ###################################
message("Processing MEDIPS QC metrics: window size: ", ws)
MeDIPset =
  MEDIPS.createSet(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   window_size = ws,
                   chr.select = chr.select)

fname <- unlist(strsplit(basename(bam_file),split="\\."))[1]
# fname

## coupling set: maps CG densities across the genome
CS = MEDIPS.couplingVector(pattern="CG", refObj=MeDIPset)

## saturation analysis #################################
## whether a given set of mapped reads is sufficient to generate a saturated and reproducible coverage profile
## calculates Pearson correlation of coverage profile between exclusive sets A and B from a sample

sr =
  MEDIPS.saturation(file = bam_file,
                    BSgenome = BSgenome,
                    uniq = uniq,
                    extend = extend,
                    shift = shift,
                    paired = paired,
                    window_size = ws,
                    chr.select = chr.select,
                    nit = 10, nrit = 1, empty_bins = TRUE, rank = FALSE)
print(paste0("Estimated correlation is: ", round(sr$maxEstCor[2], 5)))
print(paste0("True correlation is: ", round(sr$maxTruCor[2],5)))

pdf(paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf"), width = 5, height = 4)
MEDIPS.plotSaturation(sr)
dev.off()

## sequence coverage analysis ##########################
## outputs #of CpGs covered/not covered in a sample

cr =
  MEDIPS.seqCoverage(file = bam_file,
                     pattern = "CG",
                     BSgenome = BSgenome,
                     uniq = uniq,
                     extend = extend,
                     shift = shift,
                     paired = paired,
                     chr.select = chr.select)
print(paste0("Total number of reads: ", cr$numberReads))
print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO))
print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5)))

print(paste0("Number of CpGs in reference: ", length(cr$cov.res)))
print(paste0("Number of CpG not covered by a read: ", length(cr$cov.res[cr$cov.res < 1])))
print(paste0("Number of CpG covered by 1 read: ", length(cr$cov.res[cr$cov.res == 1])))
print(paste0("Number of CpG covered by 2 reads: ", length(cr$cov.res[cr$cov.res == 2])))
print(paste0("Number of CpG covered by 3 reads: ", length(cr$cov.res[cr$cov.res == 3])))
print(paste0("Number of CpG covered by 4 reads: ", length(cr$cov.res[cr$cov.res == 4])))
print(paste0("Number of CpG covered by 5 reads: ", length(cr$cov.res[cr$cov.res == 5])))
print(paste0("Number of CpG covered by >5 reads: ", length(cr$cov.res[cr$cov.res > 5])))

print(paste0("Fraction of CpG not covered by a read: ", round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 1 read: ", round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 2 reads: ", round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 3 reads: ", round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 4 reads: ", round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by 5 reads: ", round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res),5)))
print(paste0("Fraction of CpG covered by >5 reads: ", round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res),5)))


pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="pie",
                       cov.level = c(0,1,2,3,4,5),
                       main="Sequence pattern coverage, pie chart")
dev.off()

pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
                       type="hist",
                       t = 15,
                       main="Sequence pattern coverage, histogram")
dev.off()

## CpG enrichment #####################################
## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome
## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions
## genome.relH - relative freq of CpGs within reference genome
## enrichment.score.relH - regions.relH/genome.relH
## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions
## genome.GoGe - obs/exp ratio of CpGs within genomic regions
## enrichment.score.GoGe - regions.GoGe/genome.GoGe
## (relH and GoGe = 2 different ways of calculating enrichment)


## original MEDIPS.CpGenrich has IRanges issue
## this is adapted from script by Nick Cheng
MEDIPS.CpGenrichNew <-
  function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=F){

    ## Proof correctness....
    if(is.null(BSgenome)){stop("Must specify a BSgenome library.")}

    ## Read region file
    fileName=unlist(strsplit(file, "/"))[length(unlist(strsplit(file, "/")))]
    path=paste(unlist(strsplit(file, "/"))[1:(length(unlist(strsplit(file, "/"))))-1], collapse="/")
    if(path==""){path=getwd()}
    if(!fileName%in%dir(path)){stop(paste("File", fileName, " not found in", path, sep =" "))}

    dataset = get(ls(paste("package:", BSgenome, sep = ""))[1])

    if(!paired){GRange.Reads = getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}
    else{GRange.Reads = getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)}

    ## Sort chromosomes
    if(length(unique(seqlevels(GRange.Reads)))>1){chromosomes=mixedsort(unique(seqlevels(GRange.Reads)))}
    if(length(unique(seqlevels(GRange.Reads)))==1){chromosomes=unique(seqlevels(GRange.Reads))}

    ## Get chromosome lengths for all chromosomes within data set.
    cat(paste("Loading chromosome lengths for ",BSgenome, "...\n", sep=""))

    chr_lengths=as.numeric(seqlengths(dataset)[chromosomes])

    ranges(GRange.Reads) <- restrict(ranges(GRange.Reads),+1)

    ##Calculate CpG density for regions
    total=length(chromosomes)
    cat("Calculating CpG density for given regions...\n")

    ## new code ##################################
    readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE))

    regions.CG = sum(vcountPattern("CG",readsChars))
    regions.C  = sum(vcountPattern("C",readsChars))
    regions.G  = sum(vcountPattern("G",readsChars))
    all.genomic= sum(width(readsChars))

    nReads <- length(readsChars)
    ###############################################

    regions.relH=as.numeric(regions.CG)/as.numeric(all.genomic)*100
    regions.GoGe=(as.numeric(regions.CG)*as.numeric(all.genomic))/(as.numeric(regions.C)*as.numeric(regions.G))

    cat(paste("Calculating CpG density for the reference genome",
              BSgenome, "...\n", sep = " "))

    CG <- DNAStringSet("CG")
    pdict0 <- PDict(CG)
    params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn"))
    genome.CG=sum(bsapply(params, pdict = pdict0))
    params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE)
    alphabet=bsapply(params)
    genome.l=sum(as.numeric(alphabet))
    genome.C=as.numeric(sum(alphabet[2,]))
    genome.G=as.numeric(sum(alphabet[3,]))
    genome.relH=genome.CG/genome.l*100
    genome.GoGe=(genome.CG*genome.l)/(genome.C*genome.G);

    ##Calculate CpG density for reference genome

    enrichment.score.relH=regions.relH/genome.relH
    enrichment.score.GoGe=regions.GoGe/genome.GoGe

    gc()
    return(list(genome=BSgenome,
                regions.CG=regions.CG,
                regions.C=regions.C,
                regions.G=regions.G,
                regions.relH=regions.relH,
                regions.GoGe=regions.GoGe,
                genome.C=genome.C,
                genome.G=genome.G,
                genome.CG=genome.CG,
                genome.relH=genome.relH,
                genome.GoGe=genome.GoGe,
                enrichment.score.relH=enrichment.score.relH,
                enrichment.score.GoGe=enrichment.score.GoGe))
  }

er =
  MEDIPS.CpGenrichNew(file = bam_file,
                   BSgenome = BSgenome,
                   uniq = uniq,
                   extend = extend,
                   shift = shift,
                   paired = paired,
                   chr.select = chr.select)

## medips.satr.est_cor and medips.satr.tru_cor involve randomness, will not give identical results on repeat runs
## rest of metrics should be identical on repeat runs

message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.")
QC_MEDIPS.df =
  data.frame(QC_type = rep("medips_QC", 33),
             metrics = c("ref_genome",
                         "satr.est_cor",
                         "satr.tru_cor",
                         "CpG_cov.totalNumReads",
                         "CpG_cov.numReadsWoCpG",
                         "CpG_cov.fracReadsWoCpG",
                         "CpG_cov.numCpGinRef",
                         "CpG_cov.numCpGwoReads",
                         "CpG_cov.numCpGw1read",
                         "CpG_cov.numCpGw2Reads",
                         "CpG_cov.numCpGw3Reads",
                         "CpG_cov.numCpGw4Reads",
                         "CpG_cov.numCpGw5Reads",
                         "CpG_cov.numCpGgt5Reads",
                         "CpG_cov.fracCpGwoReads",
                         "CpG_cov.fracCpGw1read",
                         "CpG_cov.fracCpGw2Reads",
                         "CpG_cov.fracCpGw3Reads",
                         "CpG_cov.fracCpGw4Reads",
                         "CpG_cov.fracCpGw5Reads",
                         "CpG_cov.fracCpGgt5Reads",
                         "enrich.regions.C",
                         "enrich.regions.G",
                         "enrich.regions.CG",
                         "enrich.genome.C",
                         "enrich.genome.G",
                         "enrich.genome.CG",
                         "enrich.regions.relH",
                         "enrich.genome.relH",
                         "enrich.regions.GoGe",
                         "enrich.genome.GoGe",
                         "enrich.enrichment.score.relH",
                         "enrich.enrichment.score.GoGe"),
             values = c(er$genome,
                        round(sr$maxEstCor[2], 5),
                        round(sr$maxTruCor[2], 5),
                        cr$numberReads,
                        cr$numberReadsWO,
                        round(cr$numberReadsWO / cr$numberReads, 5),
                        length(cr$cov.res),
                        length(cr$cov.res[cr$cov.res < 1]),
                        length(cr$cov.res[cr$cov.res == 1]),
                        length(cr$cov.res[cr$cov.res == 2]),
                        length(cr$cov.res[cr$cov.res == 3]),
                        length(cr$cov.res[cr$cov.res == 4]),
                        length(cr$cov.res[cr$cov.res == 5]),
                        length(cr$cov.res[cr$cov.res > 5]),
                        round(length(cr$cov.res[cr$cov.res < 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 1]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 2]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 3]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 4]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res == 5]) / length(cr$cov.res), 5),
                        round(length(cr$cov.res[cr$cov.res > 5]) / length(cr$cov.res), 5),
                        er$regions.C,
                        er$regions.G,
                        er$regions.CG,
                        er$genome.C,
                        er$genome.G,
                        er$genome.CG,
                        round(er$regions.relH, 5),
                        round(er$genome.relH, 5),
                        round(er$regions.GoGe, 5),
                        round(er$genome.GoGe, 5),
                        round(er$enrichment.score.relH, 5),
                        round(er$enrichment.score.GoGe, 5)))
names(QC_MEDIPS.df) = c("QC_type", "metrics", fname)
# QC_MEDIPS.df

write.table(QC_MEDIPS.df, paste0(out_dir, fname, "_QC_MEDIPS.csv"), row.names=F, quote=F, sep = '\t')
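A hypothetical invocation matching the Snakemake rule above (bam name and output directory are placeholders; 300 bp is the example window size from the script's own help text):

    Rscript src/QC/QC_MEDIPS.R \
        --bamFile sample_dedupd.bam \
        --outputDir qc_medips \
        --windowSize 300

Note that although --genome appears in the usage string, the genome-loading branch is commented out and the script currently hardcodes BSgenome.Hsapiens.UCSC.hg38.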
src/R/MeDEStrandBEDPE.R:
library(docopt)

doc <- "Usage:
MeDEStrandBEDPE.R --inputFile <FILE> --outputFile <FILE> --windowSize <SIZE> --chr_select <CHRS>

--inputFile FILE     Aligned, sorted, filtered bam, or bedpelean
--outputFile FILE    Bedgraph methylation profile
--windowSize SIZE    Size of genomic windows for methylation profiling
--chr_select CHRS    Chromosomes to analyze
--help               show this help text"
opt <- docopt(doc)

#if (!file.exists(opt$inputFile)){
#  stop(paste0("bam or bedpe file not found ",opt$inputFile), call.=FALSE)
#}
#if (!file.exists(opt$outputDir)){
#  dir.create(opt$outputDir)
#}

library(MeDEStrandBEDPE)
library("BSgenome.Hsapiens.UCSC.hg38")
library(GenomicRanges)

args=(commandArgs(TRUE))

# Retrieve user parameters
sample <- opt$inputFile
output <- opt$outputFile
ws <- as.numeric(opt$windowSize)
paired <- TRUE

#  Adapted from: https://github.com/jxu1234/MeDEStrand/blob/master/R/MeDEStrand.createSet.R
#  The original function uses a hardcoded hg19 genome; here we switch to hg38
MeDEStrand.binMethyl_hg38 <- function(MSetInput=NULL, CSet=NULL, ccObj=NULL, Granges = FALSE){
  for (i in 1:2) {
    if(is.list(MSetInput)){
      MSet=MSetInput[[i]]
    }
    signal =  genome_count(MSet)
    coupling = genome_CF(CSet)
    ccObj = MeDEStrand.calibrationCurve(MSet=MSet, CSet=CSet, input=F)
    index.max = which(ccObj$mean_signal== max(ccObj$mean_signal[1:ccObj$max_index]))
    MS = ccObj$mean_signal[1:index.max]
    CF = ccObj$coupling_level[1:index.max]
    model.data = data.frame( model.MS =  MS/max( MS), model.CF = CF)
    logistic.fit = glm(model.MS ~ model.CF, family=binomial(logit), data = model.data)
    if (i == 1) { cat("Estimating and correcting CG bias for reads mapped to the DNA positive strand...\n") }
    if (i == 2) { cat("Estimating and correcting CG bias for reads mapped to the DNA negative strand...\n") }
    estim=numeric(length(ccObj$mean_signal))  # all 0's
    low_range=1:index.max
    estim[low_range]=ccObj$mean_signal[low_range]
    high_range = ( length(low_range)+1 ):length(estim)
    y.predict = predict(logistic.fit, 
                        data.frame(model.CF = ccObj$coupling_level[high_range]), 
                        type ="response")*ccObj$mean_signal[ccObj$max_index]
    estim[high_range] = y.predict
    signal=signal/estim[coupling+1]
    signal[coupling==0]=0
    signal = log2(signal)
    signal[is.na(signal)] = 0
    minsignal=min(signal[signal!=-Inf])
    signal[signal!=-Inf]=signal[signal!=-Inf]+abs(minsignal)
    maxsignal = quantile(signal[signal!=Inf], 0.9995  )
    signal[signal!=Inf & signal>maxsignal]=maxsignal
    signal=round((signal/maxsignal), digits=2)
    signal[signal==-Inf | signal ==Inf]=0
    if (i == 1) {pos.signal = signal}
    if (i == 2) {neg.signal = signal}
  }
  merged.signal = (pos.signal+neg.signal)/2
  if(!Granges) {
    return(merged.signal)}else{
      chr.select = MSet@chr_names
      window_size = window_size(MSet)
      chr_lengths=unname(seqlengths(BSgenome.Hsapiens.UCSC.hg38)[ seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo)%in%chr.select])
      no_chr_windows = ceiling(chr_lengths/window_size)
      supersize_chr = cumsum(no_chr_windows)
      chromosomes=chr.select
      all.Granges.genomeVec = MEDIPS.GenomicCoordinates(supersize_chr, no_chr_windows, chromosomes, chr_lengths, window_size)
      all.Granges.genomeVec$CF = CS@genome_CF
      all.Granges.genomeVec$binMethyl= merged.signal
      return( all.Granges.genomeVec )
    }
}

# Disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

# Set global variables for importing short reads. For details, in R console, type "?MeDEStrand.createSet"
BSgenome="BSgenome.Hsapiens.UCSC.hg38"
uniq = 1
extend = 200
shift = 0
## { change this later to be dynamic }
chr.select = strsplit(opt$chr_select, " ")[[1]]
print(chr.select)
#chr.select = paste0("chr", c(1:22,"X","Y"))

#fname <- unlist(strsplit(basename(opt$inputFile),split="\\."))[1]
#df_for_wig <- NULL
#bed_wig_output <- paste0(opt$outputDir,"/MeDEStrand_hg38_",fname,"_ws",ws,"_wig.bed")

output_df = NULL

tryCatch({

  # Create a MeDIP set
  MeDIP_seq = MeDEStrand.createSet(file=opt$inputFile, BSgenome=BSgenome, extend=extend, shift=shift, uniq=uniq, window_size=ws, chr.select=chr.select, paired=paired)

  #  Count CpG pattern in the bins
  CS = MeDEStrand.countCG(pattern="CG", refObj=MeDIP_seq)

  # Infer genome-wide absolute methylation levels:
  #result.methylation = MeDEStrand.binMethyl(MSetInput = MeDIP_seq, CSet = CS, Granges = TRUE)
  result.methylation = MeDEStrand.binMethyl_hg38(MSetInput = MeDIP_seq, CSet = CS, Granges = TRUE)

  # Create a dataframe from the previous GRanges object.
  # Warning: GRanges and UCSC BED files use different conventions for the genomic coordinates
  # GRanges use 1-based intervals (chr1:2-8 means the 2nd till and including the 8th base of chr1, i.e. a range of length of 7 bases)
  # UCSC bed-files use 0-based coordinates (chr1:2-8 means the 3rd base till and including the 8th base, i.e. a range of length of 6 bases)

  # Dataframe for generating a bed file used to generate then a wig file
  output_df <- data.frame(seqnames=seqnames(result.methylation),
                          starts=start(result.methylation)-1,
                          ends=end(result.methylation),
                          scores=elementMetadata(result.methylation)$binMethyl)

}, error = function(e){
  message("Error: MeDEStrand CpG density normalization failed due to a small number of reads")
})

write.table(output_df, file = output, quote=F, sep="\t", row.names=F, col.names=F)
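A hypothetical invocation matching the Snakemake rule above; file names and the chromosome list are placeholders, and --chr_select takes a single space-separated string:

    Rscript src/R/MeDEStrandBEDPE.R \
        --inputFile sample_dedupd_4mede.bedpe \
        --outputFile sample_dedupd_ws300_binMethyl.bedgraph \
        --windowSize 300 \
        --chr_select "chr21 chr22 chrX chrY"

The output is a headerless, tab-separated, bedgraph-like table of 0-based windows with their inferred methylation levels.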

Created: 1 year ago
Updated: 1 year ago
Maintainers: public
URL: https://github.com/pughlab/PLBR_MeDEStrandBEDPE
Name: plbr_medestrandbedpe
Version: v0.2.0
Copyright: Public Domain
License: None