Preparing a data set for Deep Learning from zipped ABR raw data files


This notebook is about pre-processing the Auditory Brainstem Response (ABR) raw data files provided by Ingham et al. to create a data set for Deep Learning models.

The unprocessed ABR data files are available at Dryad.

Since the ABR raw data are available as zip-archives, these have to be unzipped and the extracted raw data files parsed so that the time series corresponding to the ABR audiograms can be saved in a single csv file.

The final data set contains the ABR time series, an individual mouse identifier, stimulus frequency, stimulus sound pressure level (SPL) and a manually determined hearing threshold. For each mouse there are different time series corresponding to six different sound stimuli: broadband click, 6, 12, 18, 24, and 30 kHz, each of which was measured for a range of sound pressure levels. The exact range of sound levels can vary between the different mice and stimuli.

The following is done:

  • The zip archives are unpacked.

  • The extracted ABR raw data files are parsed and collected in one csv file per archive.

  • The csv files are merged into a data set of time series. Each time series corresponds to an ABR audiogram measured for a mouse at a specific frequency and sound level.

  • The mouse phenotyping data are available in Excel format. The individual data sheets are combined into one mouse phenotyping data set, maintaining the mouse pipeline and the cohort type mapping. In addition, the hearing thresholds are added to the ABR audiogram data set.

  • The data sets are curated:

    • there is a single curve per mouse, stimulus frequency and sound level,
    • each sound level is included in the list of potential sound pressure levels,
    • for each mouse for which an ABR audiogram has been measured, mouse phenotyping data are also provided.
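The steps above produce one wide table with a row per mouse, stimulus and sound level. A minimal sketch of the resulting schema (all values here are hypothetical, and only 5 of the 1953 time-step columns are shown):

```python
import pandas as pd

# Sketch of one row of the final ABR data set (hypothetical values):
# mouse identifier, stimulus frequency, sound level, the time series
# in columns t1..tN, and the manually determined hearing threshold.
n_steps = 5  # the real data set uses 1953 time steps per curve
row = {'mouse_id': 'A123 ABR', 'frequency': 100, 'sound_level': 30,
       **{'t%d' % i: 0.0 for i in range(1, n_steps + 1)},
       'threshold': 45}
df = pd.DataFrame([row])
print(df.columns.tolist())
```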

Code Snippets

%reload_ext autoreload
%autoreload 2
%matplotlib inline
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import os 
import glob
import csv
import time
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from zipfile import ZipFile
"""Define the path to the zip-archives"""
path2data = '...'
"""
Define potential frequencies measured in Hz, with the exception of 100, 
which stands for a broadband frequency stimulus (click)  
"""
freqs = [100, 6000, 12000, 18000, 24000, 30000]
print(*['potential stimulus frequencies: ' + str(x) if x==100 else str(x)+'Hz' for x in freqs], sep = ", ")
"""Define potential sound pressure levels measured in dB"""
sound_levels = [x for x in range(0, 100, 5)] 
print(*['potential sound pressure levels [dB]: ' + str(x) if x==0 else str(x) for x in sound_levels], sep = ", ")
"""Define the columns of the final data set"""
time_steps = 1953

columns = ['mouse_id', 'frequency', 'sound_level']
columns += ['t' + str(i) for i in range(1, time_steps + 1)]

"""Required to be able to subsequently exclude test measurements"""
columns.append('test')
def plot_curves(_df, _mouse_id, _freq, _sl=None, _threshold=None):
    """
    Plots ABR curves for a given mouse identifier and frequency.

    Parameters
    ----------
        _df : pandas-data-frame
            A data frame that contains ABR time series in each row. 

        _mouse_id : string
            A given mouse identifier.

        _freq : int
            A given stimulus frequency.

        _sl : int, default None
            A given sound pressure level.

        _threshold : int, default None
            A manually determined hearing threshold for a given sound stimulus.
    """

    data_range = range(1, time_steps + 1)  # covers t1 .. t1953
    data_cols = ['t' + str(i) for i in data_range]

    yticks = _df.loc[(_df.mouse_id==_mouse_id) & (_df.frequency==_freq), 'sound_level'].unique()

    plt.rcParams.update({'font.size': 20})
    plt.figure(figsize=(30, 24), dpi=200, facecolor='w', edgecolor='k')
    plt.xlabel('Time steps [overall 10ms]')
    plt.ylabel('Corresponding sound level [dB]')
    plt.title('Mouse ID: ' + str(_mouse_id) + ' - Frequency: ' + str(_freq))
    plt.yticks(yticks, fontsize='small')
    plt.ylim((min(yticks) - 5, max(yticks) + 15))
    if _sl is not None:
        _df1 = _df[(_df['sound_level']==_sl) & (_df['mouse_id']==_mouse_id) & (_df['frequency']==_freq)][data_cols]
        idx = 0
        while idx < len(_df1.index):
            plt.plot(data_range, _sl + 
                     2.5*_df1.iloc[idx],
                     color='#333F50', linewidth=2.5)
            idx+=1
    else:    
        for soundlevel in _df.loc[(_df.mouse_id==_mouse_id) & (_df.frequency==_freq), 'sound_level']:
            plt.plot(data_range, soundlevel + 
                     2.5*_df[(_df['sound_level']==soundlevel) & (_df['mouse_id']==_mouse_id) & (_df['frequency']==_freq)][data_cols].iloc[0],
                     color='#333F50', linewidth=2.5)
        if _threshold is not None: 
            plt.hlines(_threshold, -1, 2000, colors=None, linestyles='dashed', label='threshold', linewidth=5.0)
def get_duplicates(_df, _columns): 
    """
    Identifies duplicates by columns in a given dataset.

    Parameters
    ----------
        _df : pandas-data-frame
            A data frame that contains ABR time series in each row.

        _columns : list
            Columns that may contain duplicates.

    Returns
    -------
        A pandas-data-frame containing the duplicated rows from the input data frame.           
    """
    return pd.concat(g for _, g in _df.groupby(_columns) if len(g) > 1)
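A quick usage sketch of `get_duplicates` on a toy frame (the helper is redefined here so the snippet runs standalone; the mouse IDs are made up):

```python
import pandas as pd

def get_duplicates(_df, _columns):
    # identical to the notebook's helper: keep only groups with > 1 row
    return pd.concat(g for _, g in _df.groupby(_columns) if len(g) > 1)

toy = pd.DataFrame({'mouse_id': ['m1', 'm1', 'm2'],
                    'frequency': [100, 100, 100],
                    'sound_level': [30, 30, 30]})
dups = get_duplicates(toy, ['mouse_id', 'frequency', 'sound_level'])
print(len(dups))  # the two 'm1' rows form a duplicated group
```

Note that `pd.concat` raises a `ValueError` when the generator is empty, i.e. when the frame contains no duplicates at all.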
def parse_file_content(_file_content, _df, _sound_levels):
    """
    Parses the contents of an ABR raw data file.

    Parameters
    ----------
        _file_content : bytes
            The bytes of a file in a zip-archive.

        _df : pandas-data-frame
            An empty data frame with specific columns to store the results.

        _sound_levels : list
            List of potential sound pressure levels.

    Returns
    -------
        _df : pandas-data-frame
            The input data frame populated with the contents of the file. 
    """
    delimiter = '='

    idx = len(_df.index) - 1

    for item in str(_file_content,'utf-8').split('\r\n'):
        if delimiter in item:
            row = item.split(delimiter)
            if row[0]:
                row[0] = row[0].strip()

                if row[0] == 'TraceName':
                    _continue = ('ABR' in row[1])
                    if _continue:
                        split = row[1].split(',')
                        mouse_id = split[0].strip()
                        freq = int(split[1].strip())
                        sl = int(split[2].strip())
                        _continue = sl in _sound_levels
                elif row[0] == 'TraceInfo':
                    if _continue:
                        steps = row[1].split(',')[2]
                        _continue = int(steps) == time_steps
                        if _continue:
                            idx += 1
                            j = 1
                elif 'TraceData' in row[0]:
                    if _continue:
                        _df.at[idx, 'mouse_id'] = mouse_id
                        _df.at[idx, 'frequency'] = freq
                        _df.at[idx, 'sound_level'] = sl
                        _df.at[idx, 'test'] = False

                        for elem in row[1].split(','):
                            try:
                                _df.at[idx, 't'+str(j)] = float(elem.strip())
                                j+=1
                            except ValueError:
                                print("error on", elem, "!")
                elif 'Electrode Amplifier' in row[0]:
                    if _continue: 
                        _df.at[idx, 'test'] = True

    return _df
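The raw files consist of `key=value` lines, where a `TraceName` value is a comma-separated triple of mouse ID, frequency and sound level. A standalone illustration of the splitting logic used above (the sample line is made up):

```python
# Hypothetical TraceName line; the real files use the same
# 'key=value' layout with a comma-separated value.
item = 'TraceName= A123 ABR, 100, 30'
key, value = item.split('=')
mouse_id, freq, sl = [s.strip() for s in value.split(',')]
freq, sl = int(freq), int(sl)
print(mouse_id, freq, sl)
```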
def parse_zip_file2csv(_file, _columns, _sound_levels, _path2file='tmp/'): 
    """
    Extracts a given zip archive, parses the contents of the extracted raw data files and saves the results 
    in a single csv file. 

    Parameters
    ----------
        _file : string
            The name of the ABR raw data files zip archive.

        _columns : list
            The columns of the csv file containing raw data from the archive.

        _sound_levels : list
            List of potential sound pressure levels.

        _path2file : string, default 'tmp/'
            Path to csv file.

    """
    # make sure the output directory exists before appending csv files to it
    os.makedirs(_path2file, exist_ok=True)

    with ZipFile(_file, 'r') as zipFile: 

        fileNames = zipFile.namelist()

        fname = os.path.splitext(os.path.basename(_file))[0] + '.csv'
        fname = _path2file + fname

        for idx, fileName in enumerate(fileNames, 1):

            start_time = time.time() 
            extractedFile = zipFile.read(fileName)

            df = parse_file_content(extractedFile, pd.DataFrame(columns=_columns), _sound_levels)
            df = df.dropna().reset_index()

            with open(fname, 'a') as f:
                df.to_csv(f, mode='a', header=f.tell()==0, index=False)
            del df

            elapsed_time = time.time() - start_time

            print('%d. file: %s (%s)' % (idx, fileName, time.strftime("%H:%M:%S", time.gmtime(elapsed_time))))
def cleanup_dataset(_df):
    '''
    Cleans up the ABR raw data set.

    Excludes test traces, drops duplicates and 
    calculates the mean in case of multiple time series for the same mouse, frequency and sound level.
    '''
    # exclude test traces
    _df1 = _df[_df.test == False]

    # drop duplicates
    _df2 = _df1.drop_duplicates()

    # keep mean in case of multiple time series for same mouse, frequency and sound level
    _df3 = _df2.groupby(['mouse_id', 'frequency', 'sound_level']).mean().reset_index()

    return _df3
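A toy run of the clean-up logic (the helper is redefined here so the snippet is self-contained; the values are hypothetical). The two non-test curves for the same mouse, frequency and sound level are averaged into one row:

```python
import pandas as pd

def cleanup_dataset(_df):
    # same logic as the notebook's helper
    _df1 = _df[_df.test == False]
    _df2 = _df1.drop_duplicates()
    return _df2.groupby(['mouse_id', 'frequency', 'sound_level']).mean().reset_index()

toy = pd.DataFrame({'mouse_id': ['m1', 'm1', 'm1'],
                    'frequency': [100, 100, 100],
                    'sound_level': [30, 30, 30],
                    't1': [0.2, 0.4, 1.0],
                    'test': [False, False, True]})
clean = cleanup_dataset(toy)
print(clean[['mouse_id', 't1']])  # one row, t1 is the mean of 0.2 and 0.4
```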
zip_files = glob.glob(path2data + '/*.zip')
for idx, zip_file in enumerate(sorted(zip_files), 1):
    print('%d. %s' % (idx, os.path.basename(zip_file)))
start_time = time.time()
print('\nStart time: %s' % time.strftime("%H:%M:%S", time.gmtime(start_time)))

for idx, zip_file in enumerate(zip_files, start=1): 
    basename = os.path.basename(zip_file)
    print('\n%d. zip archive: %s\n' % (idx, basename))
    if not os.path.exists(os.path.join('tmp', os.path.splitext(basename)[0] + '.csv')):
        try: 
            parse_zip_file2csv(zip_file, columns, sound_levels)
        except NotImplementedError:
            print('%s: %s' % ('NotImplementedError', basename))
        except NameError:
            print('%s: %s' % ('NameError', basename))

elapsed_time = time.time() - start_time            
print('\nElapsed time: %s' % time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
"""Get list of csv files"""
csv_files = glob.glob('tmp/*.csv')
for idx, csv_file in enumerate(sorted(csv_files),1):
    print('%d. %s' % (idx, os.path.basename(csv_file)))
"""Process data from the csv files to create an ABR data set"""
df = None
for idx, csv_file in enumerate(csv_files, 1):
    print('%d. %s' % (idx, os.path.basename(csv_file)))
    try:
        _df1 = pd.read_csv(csv_file)
        _df1 = _df1[_df1.frequency.isin(freqs) & _df1.sound_level.isin(sound_levels)]
        _df1.drop(columns=['index'], inplace=True)
        _df2 = cleanup_dataset(_df1)

        print('  stimulus frequencies: %s' % set(_df2.frequency))
        print('  sound levels: %s' % set(_df2.sound_level))
        print('  number of mouse ids: %d' % _df2.mouse_id.nunique())

        if df is None: 
            df = _df2.copy()
        else: 
            df = pd.concat([df, _df2.copy()], ignore_index=True)

        del _df2
        del _df1

    except Exception as e: 
        print('Error: %s' % e)
df.head()
"""Check if any test curves in the data set"""
if True not in df.test.unique():
    print('There are no test curves in the data set.')
else: 
    print('The data set also contains test curves.')
"""Get number of mice in the data set"""
print('Mice: %d' % df.mouse_id.nunique())
"""Define path to file containing the mouse phenotyping data set"""
file = os.path.join(path2data, 'ABR_RESOURCE_Mouse ID.xlsx')
"""Read first Excel spreadsheet"""
mouse_data = pd.read_excel(file, sheet_name='Pipeline2 Controls', engine='openpyxl', usecols='B:R', parse_dates=True)
"""Delete empty rows"""
mouse_data = mouse_data.dropna(axis=0, how='all').reset_index()
"""Delete the index column"""
mouse_data.drop(columns=['index'], inplace=True)
"""Fill in the cohort type column. Possible values: 'con' for controls, 'mut' for mutants"""
mouse_data['cohort_type'] = 'con'
"""Read remaining Excel spreadsheets"""
for sheet in ['Pipeline2 Mutants', 'MouseGP Controls', 'MouseGP Mutants', 'MGP Select Controls', 'MGP Select Mutants']:
    _mouse_data = pd.read_excel(file, sheet_name=sheet, engine='openpyxl', usecols='B:R', parse_dates=True)
    _mouse_data = _mouse_data.dropna(axis=0, how='all').reset_index()
    _mouse_data.drop(columns=['index'], inplace=True)
    _mouse_data.rename(columns={'Prefix': 'Colony Prefix', 
                                'Barcode': 'Mouse Barcode', 
                                'Name': 'Mouse Name', 
                                'Age': 'Age at Test'}, inplace=True)
    if 'Mutants' in sheet: 
        _mouse_data['cohort_type'] = 'mut'
    else:
        _mouse_data['cohort_type'] = 'con'
    mouse_data = pd.concat([mouse_data, _mouse_data], ignore_index=True)
display(mouse_data.head(5))
"""Delete rows that do not have a valid mouse barcode"""
mouse_data = mouse_data[mouse_data['Mouse Barcode'] != 'Mouse Barcode'].reset_index(drop=True)
"""Define new column for mouse IDs"""
mouse_data['mouse_id'] = mouse_data['Mouse Barcode'] + ' ABR'
"""Check if the number of mice in the data set changed"""
print('Mice: %d' % mouse_data.mouse_id.nunique())
"""Always keep the first of duplicated rows"""
mouse_data = mouse_data.drop_duplicates(['mouse_id', 'Click Threshold', '6kHz Threshold', '12kHz Threshold', '18kHz Threshold', '24kHz Threshold', '30kHz Threshold'])
"""Check if duplicated rows still exist"""
duplicated = mouse_data[mouse_data.duplicated(['mouse_id', 'Click Threshold', '6kHz Threshold', '12kHz Threshold', '18kHz Threshold', '24kHz Threshold', '30kHz Threshold'])]
if duplicated.empty:
    print('There are no duplicated rows.')
else:
    display(duplicated)
"""Check if the number of mice in the data set changed"""
print('Mice: %d' % mouse_data.mouse_id.nunique())
"""Check for possible values for frequency-specific hearing thresholds"""
print('Existing hearing thresholds')
for col in ['Click Threshold', '6kHz Threshold', '12kHz Threshold', '18kHz Threshold', '24kHz Threshold', '30kHz Threshold']:
    print(' * %s [dB]: %s' % (col.split(' ')[0], sorted(list(mouse_data[col].unique()))))
"""Make sure that mouse phenotyping data are available for all mice with measured ABR curves"""
df = df[df.mouse_id.isin(mouse_data.mouse_id.unique())].reset_index(drop=True)
print('Mice with measured ABR curves: %d' % df.mouse_id.nunique())
"""Make sure that ABR curves have been measured for all mice with phenotyping data"""
mouse_data = mouse_data[mouse_data.mouse_id.isin(df.mouse_id.unique())].reset_index(drop=True)
print('Mice with phenotyping data: %d' % mouse_data.mouse_id.nunique())
"""Map the hearing threshold columns to corresponding stimulus frequencies"""
col_mapping = {100: 'Click Threshold', 
               6000: '6kHz Threshold', 
               12000: '12kHz Threshold', 
               18000: '18kHz Threshold', 
               24000: '24kHz Threshold', 
               30000: '30kHz Threshold'}
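This mapping drives a per-frequency left merge of the phenotyping thresholds onto the ABR rows; a toy illustration of one such merge step (mouse IDs and thresholds are hypothetical):

```python
import pandas as pd

# ABR rows for the click stimulus and the matching phenotyping column
abr = pd.DataFrame({'mouse_id': ['m1', 'm2'], 'frequency': [100, 100]})
pheno = pd.DataFrame({'mouse_id': ['m1', 'm2'], 'Click Threshold': [45, 60]})

# left merge on mouse_id, then rename the column to the generic 'threshold'
merged = pd.merge(left=abr, right=pheno[['mouse_id', 'Click Threshold']],
                  on='mouse_id', how='left')
merged = merged.rename(columns={'Click Threshold': 'threshold'})
print(merged.threshold.tolist())  # [45, 60]
```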
"""Add a hearing threshold column to ABR data set"""
df1 = None
for freq in col_mapping:
    print('stimulus frequency: %d%s' % (freq, '' if freq == 100 else 'Hz'))
    col = col_mapping[freq]
    df_freq = df.loc[df.frequency == freq]
    df_freq = pd.merge(left=df_freq, right=mouse_data[['mouse_id', col]], on='mouse_id', how='left')
    df_freq.rename(columns={col: 'threshold'}, inplace=True)
    if df1 is None:
        print(' create df1 ...')
        df1 = df_freq.copy()
    else: 
        print(' concat results ...')
        df1 = pd.concat([df1, df_freq])
    del df_freq
display(df1.head(5))

del df
"""Always keep the first of duplicated rows"""  
df1 = df1.drop_duplicates()
"""Check if duplicated rows still exist"""
duplicated = df1[df1.duplicated()]
if duplicated.empty: 
    print('There are no duplicated rows.')
else:
    display(duplicated)
"""Check if number of mice in the data set changed"""
print('Mice: %d' % df1.mouse_id.nunique())
"""List existing stimulus frequencies"""
print('Existing stimulus frequencies: %s' % df1.frequency.unique())
"""List existing sound levels"""
print('Existing sound levels: %s' % df1.sound_level.unique())
"""Replace NaN threshold values"""
AUL = 999
df1['threshold'] = df1['threshold'].fillna(AUL)
df1['threshold'] = df1['threshold'].astype('int32')
print('Existing hearing thresholds [dB]: %s' % sorted(df1['threshold'].unique()))
"""Keep only hearing thresholds from the potential sound levels list (or the AUL sentinel)"""
df1 = df1[df1.threshold.isin(sound_levels + [AUL])]
print('Existing hearing thresholds [dB]: %s' % sorted(df1['threshold'].unique()))
"""Check if number of mice in the data set changed"""
print('Mice: %d' % df1.mouse_id.nunique())
"""Checking for mouse IDs with multiple hearing thresholds for a given stimulus frequency"""
dup_mask = df1[df1.columns.drop('threshold')].duplicated()
mouse_ids = df1[dup_mask].mouse_id.unique()
print('Mouse IDs: %s' % mouse_ids)
"""Exclude these mouse IDs from the data set"""
df2 = df1[~df1.mouse_id.isin(mouse_ids)]
if mouse_ids.any(): 
    display(mouse_data[mouse_data.mouse_id.isin(mouse_ids)])

del df1
"""Make sure the mouse phenotyping data set contains only mice having ABR curves measured for valid sound levels"""
mouse_data2 = mouse_data[mouse_data.mouse_id.isin(df2.mouse_id.unique())].reset_index(drop=True)
"""Both data sets should have the same number of mice"""
print('%d mice with ABR curves = %d mice with phenotyping data : %s' % (df2.mouse_id.nunique(), mouse_data2.mouse_id.nunique(), (df2.mouse_id.nunique()==mouse_data2.mouse_id.nunique())))
"""Save ABR curves data set to csv file"""
df2[df2.columns.drop('test')].to_csv('abr_curves.csv', index=False)
display(df2.head(5))
"""Save mouse phenotyping data set to csv file"""
mouse_data2.to_csv('mouse_data.csv', index=False)
display(mouse_data2.head(5))
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker

import random
"""Create random list of mouse IDs"""
mice = random.sample(list(df2.mouse_id.unique()), 100)
plt.rcParams['figure.figsize'] = [10, 8]

"""Define columns with time series data"""
data_cols = ['t%d' % i for i in range(1, time_steps + 1)]

data_range = range(1, time_steps + 1)

for mouse in mice[:1]: 

    fig = plt.figure(constrained_layout=True, figsize=(80, 64))

    sound_levels = df2['sound_level'].unique()
    df = df2[df2.mouse_id == mouse]

    cols = 2
    rows = 3 #int(len(df.frequency.unique()) / cols)
    col = 0
    row = 0
    spec = gridspec.GridSpec(ncols=cols, nrows=rows, figure=fig)
    f_ax = {}

    for idx, freq in enumerate(df.frequency.unique()):

        f_ax[idx] = fig.add_subplot(spec[row, col])
        # the subplot title is set below, once the threshold is known
        f_ax[idx].set_yticks(sound_levels)

        """Get hearing threshold for given stimulus frequency"""
        human_thr = None
        thr = df[df['frequency'] == freq]['threshold'].unique()
        if len(thr) > 0:
            human_thr = thr[0]
        """Plot the curves"""
        plt.rcParams.update({'font.size': 20})
        f_ax[idx].set_xlabel('Timesteps [overall 10ms]')
        f_ax[idx].set_ylabel('Sound level [dB]')
        if freq == 100:
            f_ax[idx].set_title('Click - manually assigned threshold: %sdB' % human_thr)
        else:
            f_ax[idx].set_title('%dkHz - manually assigned threshold: %sdB' % (freq/1000, human_thr))

        for sound_level in df.loc[df['frequency'] == freq, 'sound_level']:
            f_ax[idx].plot(data_range, sound_level +
                           2.5 * df[(df['sound_level'] == sound_level) & (df['frequency'] == freq)][data_cols].iloc[0],
                           linewidth=2.5)

        if human_thr is not None and human_thr != 999:
            f_ax[idx].hlines(y=human_thr,
                             xmin=data_range[0], xmax=data_range[-1],
                             linewidth=2.5, linestyles='dotted')

        col += 1
        if col == cols:
            row += 1
            col = 0

        labels = [sl for sl in sound_levels]
        f_ax[idx].yaxis.set_major_formatter(ticker.FixedFormatter(labels))

    fig.suptitle('Mouse ID: %s' % mouse, fontsize=24)
#     _file = 'curves/' + mouse.replace(' ', '_')
#     plt.savefig(_file)

URL: https://workflowhub.eu/workflows/216
Copyright: Public Domain