Benchmarking Machine Learning Methods for Identifying Mislabeled Data


This repo contains code for the manuscript Benchmarking machine learning methods for the identification of mislabeled data.

First, the datasets are downloaded, processed and saved into the folder datasets as input for the filters. (ToDo: add rules for this step.)

Main file: runFiltersAllParallel.py, which reads

runRFilter.py (which in turn reads runRFilter.r), cleanLabFilter, filtersScikiClean, utils, addNoiseScikit, DNNwERLLoss and AEFilter (not needed now).
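
For orientation, here is a minimal, hypothetical sketch of a single Python-side filter run, mirroring the call pattern of runFiltersAllParallel.py shown in the code snippets below (getData, filtersScikiClean and confusionMatrixScikit are the repo's own helpers; the dataset name, noise settings and model are placeholder values, and feature scaling is omitted for brevity):

import sys
import pandas as pd

sys.path.insert(0, 'scripts/')
from utils import *             # getData, confusionMatrixScikit, ...
from addNoiseScikit import *    # noise-injection helpers
from filtersScikiClean import filtersScikiClean

# load one processed dataset (tab-separated, zip-compressed, label in the last column)
df = pd.read_csv('datasets/Magic.csv.gz', sep='\t', compression='zip')

# draw a 1000-row sample, inject 10% symmetric label noise and run one filter
X, y, noisyLabels = getData(df, 'Magic', 'Sym', 0.1, 1000)
noiseInd = y[y != noisyLabels].index
foundNoiseInd = filtersScikiClean(X, y, noisyLabels, t=0.5, n=0.1, model=['KDN'])

# compare injected vs. detected noise indices
cv, scores = confusionMatrixScikit(y, noiseInd, foundNoiseInd)
print(cv)
print(scores)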

Run the workflow

sbatch_cluster_snakemake -c cluster.json -s Snakefile -j 10 -z -i -n

Code Snippets

import pysam
import pandas as pd
from tqdm import tqdm

file = snakemake.input.cadd
clinvar = snakemake.input.cv
output = str(snakemake.output)
test = snakemake.params.test
if test:
    nrows = 500
else:
    nrows = 1000000000

allYears = pd.read_csv(clinvar, sep = '\t', compression = 'zip', nrows = nrows) 

tabixfile = pysam.TabixFile(file)
header = list(tabixfile.header)[1].split('\t')

lines = []

for row in tqdm(allYears.itertuples(),  total=allYears.shape[0]):
    for line in tabixfile.fetch(row.Chromosome, row.Start-1, row.Start):
        line = line.split('\t')
        fref = line[2]
        falt = line[3]
        if (fref==row.Ref)&(falt==row.Alt):
            lines.append(line)
df = pd.DataFrame(lines)
df.columns = header
df= df.drop_duplicates(subset = header[:4]).reset_index(drop = True)
labels = ['LabelOld', 'LabelNew']
print('Length of CADD annotated variants is {}, length of ClinVar variants {}'.format(len(df), len(allYears)))
df[labels] = allYears[labels]

df.to_csv(output, compression = 'zip',sep =  '\t',
         index = None)
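
Note that fetch(row.Chromosome, row.Start-1, row.Start) relies on pysam's 0-based, half-open query coordinates, while ClinVar's Start column is 1-based. A small self-contained sketch of that convention (toy file name, positions and alleles are made up):

import pysam

# tiny position-sorted TSV: chrom, pos (1-based), ref, alt
with open('toy.tsv', 'w') as fh:
    fh.write('1\t100\tA\tG\n1\t101\tC\tT\n')
pysam.tabix_compress('toy.tsv', 'toy.tsv.gz', force=True)
pysam.tabix_index('toy.tsv.gz', seq_col=0, start_col=1, end_col=1, force=True)

tbx = pysam.TabixFile('toy.tsv.gz')
# fetch() takes 0-based, half-open coordinates, so the 1-based position 101
# is retrieved with fetch('1', 100, 101) -- the same Start-1, Start pattern as above
for line in tbx.fetch('1', 100, 101):
    print(line)   # -> 1  101  C  T
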
import pandas as pd
import sys 
sys.path.insert(0, 'scripts/')
from utils import *

# fill NAs with 0
# convert classes to 0, 1, 2, etc. (sometimes they are strings or 10, 20, etc.)
# get dummies for categorical data
# the label column is the last column and is named Label (or LabelOld, LabelNew for ClinVar)
# can decrease the number of classes or balance the data (NOT DONE)


names = ['Adult','DryBean','Chess','Magic','ClinVarReal','ClinVarArt','RNA0','RNA1','RNA2', 'HEPMASS','Pokerhand', 'IFD']

for i,name in enumerate(names):
    print(name)
    df = pd.read_csv('dataProduced/'+name+'.csv.gz', sep = '\t',compression='zip')
    # n is the number of classes
    n = len(df.iloc[:,-1].value_counts())

    # nl is the number of labels
    nl=1
    if name in ['ClinVarReal']:
        nl=2
    else:
        df = df.rename(columns = {df.columns[-1]:'Label'})



    df = getProperLabels(df,nl)
   # df = decreaseClassNumber(df,n) 
   # df = getProperLabels(df,nl) #need this twice when decreasing the number of classes

    data = df[df.columns[~df.columns.str.contains('Label',na=False)]]
    labels = df[df.columns[df.columns.str.contains('Label',na=False)]]
    data = pd.get_dummies(data)
    data = data.join(labels)
    #data = balance(data)
    data = data.fillna(0)
    print(name, data.shape)
    print(data.iloc[:,-1].value_counts())
    data.to_csv('datasets/'+name+'.csv.gz',sep = '\t', index = None, 
             compression = 'zip')
    data = data.sample(n=min(len(data),10000))
    data.to_csv('datasetsSample/'+name+'.csv.gz',sep = '\t', index = None, 
             compression = 'zip')
import pandas as pd
import os
from glob import glob

output = snakemake.output
test = snakemake.params.test
# not all years have data for all months
years = [2015,2016,2017,2018,2019,2020,2021,2022,2023]  


if test:
    years = [2016]


# the ClinicalSignificance column has various classes; keep only these to avoid ambiguity
classes = ['Uncertain significance', 'Likely benign', 'Benign', 'Conflicting interpretations of pathogenicity', 
           'Pathogenic', 'Likely pathogenic', 'Benign/Likely benign', 'Pathogenic/Likely pathogenic']

CHR = pd.Series(list(range(1,23))+['X','Y']).astype(str)

dates = []
di = {'Ref': ['ReferenceAllele', 'ReferenceAlleleVCF'],
                 'Alt' : ['AlternateAllele', 'AlternateAlleleVCF' ]}
allYears = pd.DataFrame()
for year in years:
    y = str(year)
    print(year)
    files = pd.Series(glob('dataRaw/ClinVar/'+y+'/*')).sort_values()[:]
    for file in files:
        df_temp = pd.read_csv(file,sep = '\t',low_memory=False, na_values = ['na'])

        # some months in 2015 do not have any allele column: ignore them 
        if not sum(df_temp.columns.str.contains('ReferenceAllele')):
            print(file)
            pass 
        else:

            d = file.split('_')[2].split('.')[0]
            dates.append(d)
            # drop unnecessary information
            df_temp = df_temp[df_temp['Type']=='single nucleotide variant']
            df_temp = df_temp[df_temp['Assembly']=='GRCh38']
            df_temp = df_temp[df_temp['ClinicalSignificance'].isin(classes)]


            for key in di.keys():

                df_temp[key] = df_temp[di[key][0]]
                ind = df_temp[df_temp[key].isna()].index
                # from the end of 2020, the ReferenceAllele and AlternateAllele columns are no longer used;
                # ReferenceAlleleVCF and AlternateAlleleVCF are used instead
                if sum(df_temp.columns.str.contains(di[key][1])):

                    df_temp.loc[ind, key] = df_temp.loc[ind, di[key][1]]

            s = len(df_temp)

            cols = ['Chromosome', 'Start', 'Ref', 'Alt']
            df_temp = df_temp.dropna(subset = cols, how = 'any')

            df_temp = df_temp[((df_temp['Ref'].isin(['A', 'T', 'C', 'G']))&(df_temp['Alt'].isin(['A', 'T', 'C', 'G'])))]
            df_temp = df_temp[df_temp['Chromosome'].astype(str).isin(CHR)]
            df_temp['Start'] = df_temp['Start'].astype(int)

            df_temp = df_temp.drop_duplicates(cols)
            print('{}: Length of the joint file is {}, dropped {} variants from loaded file'.format(d,len(df_temp),s-len(df_temp)))
            df_temp = df_temp.set_index(cols, drop = False)
            allYears = pd.concat([allYears, df_temp['ClinicalSignificance']],axis = 1, join = 'outer')

allYears.columns = dates
# unique() keeps the order of label occurrence
uniqueLabels = allYears.T.apply(lambda x: x.dropna().unique())
allYears['uniqueLabels'] = uniqueLabels
allYears['LabelOld'] = allYears['uniqueLabels'].str[0]
allYears['LabelNew'] = allYears['uniqueLabels'].str[-1]
labels = ['LabelOld', 'LabelNew']


to_replace = {'Uncertain significance': 'VUS',
              # keys must match the full class names above for replace() to take effect
              'Conflicting interpretations of pathogenicity': 'Conflicting',
              'Pathogenic/Likely pathogenic': 'Pathogenic',
              'Likely pathogenic': 'Pathogenic',
              'Benign/Likely benign': 'Benign',
              'Likely benign': 'Benign'}
allYears[labels] = allYears[labels].replace(to_replace)


changed = (allYears['LabelOld']!=allYears['LabelNew']).sum()
allYears.index.names = cols

allYears = allYears.sort_index()

print('The final file has {} entries. {} variants changed their clinical interpretation.'.format(len(allYears),changed ))


allYears.to_csv(output[0],index = True, header = True,
                sep='\t', compression = 'zip' )

allYears[labels].to_csv(output[1],index = True, header = True,
                sep='\t', compression = 'zip' )
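
To make the LabelOld/LabelNew construction above concrete, here is a small self-contained toy example (dates and labels are made up) showing how the oldest and newest interpretation per variant fall out of unique():

import pandas as pd

# toy version of allYears: one row per variant, one column per ClinVar release
toy = pd.DataFrame(
    {'2016-01': ['Uncertain significance', 'Benign'],
     '2019-06': ['Likely pathogenic',      None],
     '2022-12': ['Pathogenic',             'Benign']},
    index=['variantA', 'variantB'])

# unique() preserves the order of first occurrence, so element [0] is the
# oldest interpretation and [-1] the most recent one
uniqueLabels = toy.T.apply(lambda x: x.dropna().unique())
toy['LabelOld'] = uniqueLabels.str[0]
toy['LabelNew'] = uniqueLabels.str[-1]
print(toy[['LabelOld', 'LabelNew']])
# variantA: Uncertain significance -> Pathogenic
# variantB: Benign                 -> Benign
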
download=$1


if [[ $download == True ]]; then
    wget https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh38/whole_genome_SNVs_inclAnno.tsv.gz -P dataRaw/
    wget https://kircherlab.bihealth.org/download/CADD/v1.6/GRCh38/whole_genome_SNVs_inclAnno.tsv.gz.tbi -P dataRaw/
else
    ln -sf /fast/work/groups/ag_kircher/CADD/projects/genome16/whole_genome_GRCh38/all_SNV_inclAnno.tsv.gz dataRaw/
    ln -sf /fast/work/groups/ag_kircher/CADD/projects/genome16/whole_genome_GRCh38/all_SNV_inclAnno.tsv.gz.tbi dataRaw/
fi
output=$1
mkdir -p dataRaw/ClinVar;

for year in 2015 2016 2017 2018 2019; do
    mkdir dataRaw/ClinVar/$year
    for month in 01 02 03 04 05 06 07 08 09 10 11 12; do 
        wget   https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/$year/variant_summary_$year-$month.txt.gz -P dataRaw/ClinVar/$year;
        done
     done


# years 2020, 2021 and 2022 are in a different directory
for year in 2020 2021 2022; do
    mkdir dataRaw/ClinVar/$year
    for month in 01 02 03 04 05 06 07 08 09 10 11 12; do 
        wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/variant_summary_$year-$month.txt.gz -P dataRaw/ClinVar/$year;
        done
     done

# year 2023 has only one entry     
for year in 2023; do
    mkdir dataRaw/ClinVar/$year
    for month in 01; do 
        wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/archive/variant_summary_$year-$month.txt.gz -P dataRaw/ClinVar/$year;
        done
     done


echo "done" > $output
output=$1

# download the data from archive.ics.uci.edu 

# HEPMASS (download the test split because it is smaller)
wget http://archive.ics.uci.edu/ml/machine-learning-databases/00347/all_test.csv.gz -P dataRaw/ 
# PokerHand
wget  https://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data -P dataRaw/
# Internet+Firewall+Data (IFD)
wget  https://archive.ics.uci.edu/ml/machine-learning-databases/00542/log2.csv -P dataRaw/
# Magic
wget https://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data -P dataRaw/
# DryBean
wget https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip -P dataRaw/
unzip dataRaw/DryBeanDataset.zip -d dataRaw/
# Adult
wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data -P dataRaw/
# Chess
wget  https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data -P dataRaw/

# download RNA data
wget  https://content.cruk.cam.ac.uk/jmlab/atlas_data.tar.gz -P dataRaw/
tar -xzvf dataRaw/atlas_data.tar.gz -C dataRaw/
rm dataRaw/atlas_data.tar.gz


echo "done" > $output
import pandas as pd
import sys 
sys.path.insert(0, 'scripts/')
from utils import *
import subprocess

test = snakemake.params.test

# Some of the raw datasets need preprocessing before being written to dataProduced

# HEPMASS
df= pd.read_csv('dataRaw/all_test.csv.gz')
to_save = df.iloc[:,1:]
to_save['Label'] = df['# label']
# save only a sample of 100,000 points
print(to_save['Label'].value_counts())

to_save.sample(n = 100000).to_csv('dataProduced/HEPMASS.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )   

# PokerHand                
df= pd.read_csv('dataRaw/poker-hand-training-true.data', header = None)
print(df.shape)

df['Label'] = 0
# label is 1 if there is something in the hand, otherwise 0
df.loc[df[10]!=0,'Label'] = 1
df = df.drop(columns = [10])
df.to_csv('dataProduced/Pokerhand.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )

# IFD      
df= pd.read_csv('dataRaw/log2.csv')
print(df.shape)

to_repl = {'allow':0, 'drop':1, 'deny':2, 'reset-both':3}
df['Label'] = df['Action'].replace(to_repl)
df = df.drop(columns = 'Action')
# delete one class since the frequency is almost zero
df = df[df['Label']!=3]
df.to_csv('dataProduced/IFD.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )


# Chess
df = pd.read_csv('dataRaw/krkopt.data', header = None)
n = 9
df = decreaseClassNumber(df,n)
df.to_csv('dataProduced/Chess.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )


# Drybean
df = pd.read_csv('dataRaw/DryBeanDataset/Dry_Bean_Dataset.arff',skiprows = 25
                 , header = None)
df.to_csv('dataProduced/DryBean.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )

# Adult
df = pd.read_csv('dataRaw/adult.data',header = None)
df.to_csv('dataProduced/Adult.csv.gz', sep = '\t',
                                  index = None, compression = 'zip' )      

# Magic
df = pd.read_csv('dataRaw/magic04.data',skiprows = 0
                 , header = None)
df.to_csv('dataProduced/Magic.csv.gz', sep = '\t',
                                       index = None, compression = 'zip' )

subprocess.run("python scripts/getData/createRNAData.py  {}".format(test), shell=True)
subprocess.run("python scripts/getData/createClinVarData.py  {}".format(test), shell=True)
import umap.umap_ as umap
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from scipy.sparse import csc_matrix

file = str(snakemake.input)
output = str(snakemake.output)
test = snakemake.params.test
if test:
    nrows = 9000000
else:
    nrows = 1000000000

df = pd.read_csv(file,sep = ' ', skiprows = 2, header = None, 
                 nrows = nrows, 
                 dtype = {0: 'int16', 1: 'int32', 2: 'int16'})


row = df[1].to_numpy()
col = df[0].to_numpy()
data = df[2].to_numpy()
d = pd.DataFrame(csc_matrix((data, (row, col))).toarray())
d[0].sum()

d = d[1:]


# n_components = d.shape[0]
n_components = 50


reducer = umap.UMAP(n_components = n_components)
scaled_data = StandardScaler().fit_transform(d.values)
embedding = reducer.fit_transform(scaled_data)
pd.DataFrame(embedding).to_csv(output,sep = '\t', index = None, 
                               header = None, compression = 'zip')
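
As a reminder of what the csc_matrix((data, (row, col))) call above does, here is a tiny self-contained toy example (made-up counts) that turns MatrixMarket-style triplets into a dense cell-by-gene table:

import pandas as pd
from scipy.sparse import csc_matrix

# toy triplets: column 0 = gene index, column 1 = cell index, column 2 = count
triplets = pd.DataFrame({0: [1, 2, 2], 1: [1, 1, 3], 2: [5, 7, 1]})

row = triplets[1].to_numpy()    # cells become matrix rows
col = triplets[0].to_numpy()    # genes become matrix columns
data = triplets[2].to_numpy()

dense = pd.DataFrame(csc_matrix((data, (row, col))).toarray())
print(dense)
# row 0 and column 0 stay empty because MatrixMarket indices are 1-based,
# which is presumably why the script drops the first row with d = d[1:]
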
import sys
import numpy as np
import random
import pandas as pd
import warnings 
from time import time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import traceback
from sklearn.preprocessing import StandardScaler


warnings.filterwarnings('ignore')
sys.path.insert(0, 'scripts/')

from runRFilter import *
from cleanLabFilter import CleanLab
from filtersScikiClean import filtersScikiClean
from utils import *
from addNoiseScikit import *
from DNNwERLLoss import DNNwERLLoss
from AEFilter import AEFilter

path = str(snakemake.output)
#path = 'temp/Encode_0.1_100_Asym_KDN_Python.tmp'
#st = 'temp/Magic_0.1_100_Asym_edgeBoostFilter_R.tmp'
st  = path.split('/')[1].split('.tmp')[0].split('_')
name = st[0]
noiseLevel = float(st[1])
datasetSize = int(st[2])
noiseType = st[3]
model = st[4]
imp = st[5]

repeats = int(snakemake.params.repeats)
beta = float(snakemake.params.beta)
plusLayers =  int(snakemake.params.plusLayers)
learningRate = float(snakemake.params.learningRate)
scaling = eval(snakemake.params.scaling)
loss = snakemake.params.loss

di = {'Real' : 2, 'Sym':1, 'Asym' :1}

df = pd.read_csv('datasets/' + name + '.csv.gz', sep = '\t',compression='zip',
                index_col=None)

df = df.fillna(0)
if scaling:
    scaler = StandardScaler()        
    n = 2 if name in ['ClinVarReal'] else 1
    scaler = scaler.fit(df.iloc[:,:-n])

ID = [name, model, noiseLevel, noiseType,datasetSize]

dfMeansCV = pd.DataFrame()
try:
    for r in range(repeats):

        status = 'F' # F:Failure S: Success N: No noise found
        ExtraInfo = ""
        ID = [name, model, noiseLevel, noiseType,datasetSize,r]
        X, y, noisyLabels = getData(df,name, noiseType, noiseLevel, datasetSize)

        X = pd.DataFrame(scaler.transform(X))
        X = X[X.columns[X.nunique() > 1]] # drop constant columns (no variability)

        noiseInd = y[y!=noisyLabels].index
        dR = pd.DataFrame(np.vstack([X.T, noisyLabels.tolist()]).T)
        t0 = time()

        if imp=='Python':
            foundNoiseInd = filtersScikiClean(X,y,noisyLabels, t = 0.5,n = noiseLevel, model = [model])
        if imp=='CleanLab':
            foundNoiseInd = CleanLab(X,y,noisyLabels)
        if imp=='R':
            foundNoiseInd = getRModel(dR, y, noisyLabels,model = [model])
        if model=='ERL':
            foundNoiseInd, metrics = DNNwERLLoss(X,y,noisyLabels,beta,plusLayers,learningRate, loss)
            ExtraInfo = {}
            ExtraInfo.update({'beta' :beta, 'plusLayers' :plusLayers, 'learningRate' : learningRate, 'scaling' : scaling,
                              'loss' : loss, 
                            #  'metrics' : metrics
                             })

        if model=='AE':
            foundNoiseInd, cms = AEFilter(X,y,noisyLabels)
          #  ExtraInfo = 'cms:{}'.format(cms)


        cv, scores = confusionMatrixScikit(y,noiseInd, foundNoiseInd)

        if len(foundNoiseInd)==0:
            status = 'N'
        else:
            status = 'S'


        cv.index = [model]
        scores.index = [model]
        cv.insert(0,'Status',status)

        t1 = time()
        totalTime = t1-t0
        cv['Execution Time'] = totalTime

        temp = pd.DataFrame([str(ExtraInfo),str(noiseInd.to_list()), str(foundNoiseInd.to_list()),t1],
                 index = ['ExtraInfo','NoiseInd', 'FoundNoiseInd','Time']).T
        temp.index=[model]
        cv = cv.join(scores).join(temp)
        cv.index = [str(ID)]

        dfMeansCV = pd.concat([dfMeansCV, cv])  # DataFrame.append is deprecated/removed in newer pandas
        dfMeansCV = dfMeansCV.round(4)

except Exception as e: 
    print('CAUGHT AN ERROR IN ', str(ID))
    print(e)
    log = traceback.format_exc()

    pd.Series(log).to_csv('logs/'+str(ID)+'.log', sep = '\t',header = False)
    cv = pd.DataFrame([np.nan]*13).T
    cv.insert(0,'Status',status)
    cv.index = [str(ID)]
    dfMeansCV = pd.concat([dfMeansCV, cv])




dfMeansCV.iloc[:,:10].to_csv('output/'+name+'_'+imp+'.csv', sep = '\t',header = False, mode='a')
dfMeansCV.iloc[:,:10].to_csv(path, sep = '\t',header = False)

dfMeansCV.to_csv('output/'+name+'_'+imp+'_Extended.csv', sep = '\t',header = False, mode='a')
script:
    """scripts/runFiltersAllParallel.py"""       
SnakeMake From line 62 of main/Snakefile
script:
    """scripts/getData/cleanAndSample.py"""
SnakeMake From line 81 of main/Snakefile
script:
    """scripts/getData/prepareData.py"""    
SnakeMake From line 93 of main/Snakefile
script:
    """scripts/getData/annotateClinVarWithCADD.py"""
SnakeMake From line 106 of main/Snakefile
shell:
    """scripts/getData/downloadCADDData.sh {params.download}"""
SnakeMake From line 117 of main/Snakefile
script:
    """scripts/getData/createClinVarOldNewLabels.py"""
SnakeMake From line 130 of main/Snakefile
script:
    "scripts/getData/rnaUMAP.py"
SnakeMake From line 142 of main/Snakefile
shell:
    """scripts/getData/downloadClinVarData.sh {output}"""        
SnakeMake From line 148 of main/Snakefile
shell:
    """scripts/getData/downloadData.sh {output}"""
SnakeMake From line 155 of main/Snakefile
Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/nazaretl/MisLaReview
Name: mislareview
Version: 1
Copyright: Public Domain
License: None