Artifact detection algorithms¶

Classic algorithms¶

In [1]:
import numpy as np
import scipy.stats as stats
import pandas as pd


class artefact_recognition:

    @staticmethod
    def iqr(df, column_list, factor=3):
        """ Executes artefact recognition by Interquartile Range (IQR)
        INPUT:
            df: df of vitals for one patient
            column_list: list of columns to remove artefacts from.
            factor: multiple of the IQR beyond the quartiles above/below which data are dropped - default is 3
        OUTPUT:
            cleaned df."""
        for i in column_list:
            df = df.drop(df[df[i] < (np.percentile(df[i], 25) - factor * stats.iqr(df[i]))].index)
            df = df.drop(df[df[i] > (np.percentile(df[i], 75) + factor * stats.iqr(df[i]))].index)
        return df

    @staticmethod
    def z_value(df, column_list):
        """ Executes artefact recognition by z-value
        INPUT:
            df: df of vitals for one patient
            column_list: list of columns to remove artefacts from. 
        OUTPUT:
            cleaned df."""
        z_scores = stats.zscore(df[column_list])
        abs_z_scores = np.abs(z_scores)
        filtered_entries = (abs_z_scores < 3).all(axis=1)
        return df[filtered_entries]
    
    cut_off_df = pd.DataFrame(
        data=[[20, 300], [10, 250], [5, 225], [20, 300], [10, 250], [5, 225],
              [5, 150], [0, 100], [25, 45], [25, 45], [5, 300], [5, 150]],
        columns=['mi', 'ma'],
        index=['sys', 'map', 'dia', 'n_sys', 'n_map', 'n_dia', 'etCO2', 'SpO2', 'Ta', 'Tb', 'HF', 'CO2'])

    @staticmethod
    def cut_off(df, column_list, cut_offs=cut_off_df):
        """ Executes artefact recognition by applying pre-defined cut-offs
        INPUT:
            df: df of vitals for one patient
            column_list: list of columns to remove artefacts from.
            cut_offs: dataframe with the min ('mi') and max ('ma') for each column - default is a predefined df for sys, map, dia, etCO2, SpO2, Ta, Tb, HF
        OUTPUT:
            cleaned df."""
        for i in column_list:
            df = df.drop(df[df[i] < cut_offs.loc[i]['mi']].index)
            df = df.drop(df[df[i] > cut_offs.loc[i]['ma']].index)
        return df
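
A minimal usage sketch for the three classic filters, assuming a vitals DataFrame whose column names match the index of cut_off_df (the variable name and toy values below are illustrative only):

In [ ]:
# hypothetical toy data: one obvious systolic-pressure spike
vitals = pd.DataFrame({'sys': [120, 118, 123, 119, 950, 121],
                       'HF':  [72, 70, 75, 71, 73, 74]})

# each method returns the df with the flagged rows dropped
cleaned_iqr = artefact_recognition.iqr(vitals, ['sys'], factor=3)
cleaned_z   = artefact_recognition.z_value(vitals, ['sys', 'HF'])
cleaned_cut = artefact_recognition.cut_off(vitals, ['sys', 'HF'])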

Local outlier factor¶

In [231]:
def lrds_(dataset, cols, neighbors, distances):
    """Computes the local reachability density (lrd) of every point from its k nearest neighbours."""
    lrds = []

    for A, *Nk in neighbors:
        # A is the index of the query point itself, Nk are the indices of its neighbours
        xA, yA = dataset.loc[A, cols]

        sum_reach_distance = 0
        
        for B in Nk:
            if B < len(dataset):
                
                k_distance_B = dataset.loc[B, 'k_distance_B']    

                xB, yB = dataset.loc[B, cols]

                dAB = distance.euclidean([xA, yA], [xB, yB])

                reach_distance_AB = max(k_distance_B, dAB)
                sum_reach_distance += reach_distance_AB

        if(len(Nk) == 0):
            print('zero entry neighbors list')
            
        if(sum_reach_distance == 0):
            sum_reach_distance = 1e-9

                
        lrd_A = 1/(sum_reach_distance/len(Nk))
        lrds.append(lrd_A)

    return lrds
In [232]:
def lofs_(lrds, neighbors, i):
    """Computes the local outlier factor (LOF) of every point as the ratio of its neighbours' lrds to its own."""
    lof = []

    if(len(lrds) < len(neighbors[0])):
        return lrds

    for A, *Nk in neighbors:
        tmp = []
        for B in Nk:
            try:
                tmp.append(lrds[B])
            except IndexError:
                print(lrds)
                print(len(neighbors))
                print(B)
            
        sum_lrds = sum(tmp)
        lof.append(sum_lrds/(len(Nk) * lrds[A]))

    return lof
In [233]:
from scipy.spatial import KDTree, distance

def lof(df, column_list, max_lof_threshold=1.5):
    x = df.copy()
    x.reset_index(inplace=True)
    x['seconds'] = (x['measurement_datetime'].astype('int64') // 1e9).astype('int64')
    
    k = 7  # number of nearest neighbours used for the LOF computation
    
    lofs = []
    
    for column in column_list:    
        kd = KDTree(x[['seconds', column]])

        # get the k nearest neighbours and their distances for each and every point
        kdquery = kd.query(x[['seconds', column]], k=k)
        distances, neighbors = kdquery

        # add the k-distance to B to the df
        x['k_distance_B'] = distances[:, -1]

        lrds = lrds_(x, ['seconds', column], neighbors, distances)
        lofs.append(lofs_(lrds, neighbors, 0))
    

    return x[(np.array(lofs) < max_lof_threshold).any(axis = 0)]
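
A minimal sketch of running the LOF filter on a single patient, assuming a frame with a 'measurement_datetime' column and one vitals column (toy data and threshold are illustrative only):

In [ ]:
# hypothetical one-patient frame with timestamps and a heart-rate column containing one spike
toy = pd.DataFrame({
    'measurement_datetime': pd.date_range('2023-01-01', periods=20, freq='min'),
    'HF': [72, 71, 73, 74, 72, 250, 73, 72, 71, 74, 73, 72, 71, 73, 74, 72, 73, 72, 71, 74],
})

# rows whose LOF reaches the threshold in every checked column are dropped
toy_clean = lof(toy, ['HF'], max_lof_threshold=1.5)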

LSTM¶

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras

# Loading Dataset with pandas
df = pd.read_csv('vitalsign_dataset.csv')
In [ ]:
objects = pd.read_pickle("patient_list.pkl")
icu = np.asarray(objects)
In [4]:
def get_group(patient_id, icu):
    """
    Retrieves the group of a patient from the ICU data based on their ID.

    Args:
        patient_id (int): The ID of the patient to search for.
        icu (array-like): The ICU data structure containing patient information.

    Returns:
        str or None: The group of the patient if found, or None if the patient is not in the ICU.
    """
    for i, pat_id in enumerate(icu[:, 5]):
        if(int(patient_id)==int(pat_id)):
            return icu[i,1]
    return None
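
A hedged call sketch (the patient ID below is purely illustrative; icu is the array loaded from patient_list.pkl above):

In [ ]:
print(get_group(4711, icu))   # prints the patient's group, e.g. 'ICU' or 'OP', or None if the ID is unknown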
In [7]:
df["artefact"] = df["artefact"].astype(int)
In [8]:
features = df.columns
print(features)
Index(['measurement_id', 'person_id', 'measurement_concept_id',
       'value_as_number', 'measurement_datetime', 'Vitalsign', 'artefact'],
      dtype='object')
In [9]:
def combine_rows(data, start=0, end=100, window_size=2, b_make_sequence=False, window_future_steps = 0):
    """
    Combines rows of data into a single array with specified window size.

    Args:
        data (array-like): The input data to be combined.
        start (int, optional): The starting index for combining rows. Defaults to 0.
        end (int, optional): The ending index (exclusive) for combining rows. Defaults to 100.
        window_size (int, optional): The size of the sliding window. Defaults to 2.
        b_make_sequence (bool, optional): If True, the function returns a sequence of combined rows. 
                                          If False, it returns a 2D array with combined rows. 
                                          Defaults to False.
        window_future_steps (int, optional): The number of future steps to include in each combined row. 
                                             Defaults to 0.

    Returns:
        array-like: The combined rows of data.

    """
    
    if(b_make_sequence):
        main_data = []

    else:
        main_data = np.zeros((end-start, data.shape[1]*window_size))
    size = data.shape[1]
    for i in range(start, end):
        input_data = []
        index = 0
        for j in range(i - window_size+1, i+window_future_steps):
            if(not (j<0 or j<start or j>=end)):
                if(b_make_sequence):
                    input_data.append(data[j,:])
                else:
                    main_data[i, size*index:size*index+size] = data[j,:]
            else:
                if(b_make_sequence):
                    input_data.append(np.zeros(data[0,:].shape))
            index+=1
        # appending windows only applies in sequence mode; in 2D mode the window was written in place above
        if(b_make_sequence):
            main_data.append(input_data)
    if(b_make_sequence):
        return np.array(main_data).astype(float)
    else:
        return main_data.astype(float)
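
A quick sanity check of the sliding-window logic in sequence mode (toy array and parameters chosen only to keep the example small): each output row stacks the rows from index i - window_size + 1 up to i + window_future_steps - 1, with zero rows padded in where the window leaves the [start, end) range.

In [ ]:
# hypothetical 4x3 toy array; the window covers the previous row and the current row
toy = np.arange(12).reshape(4, 3)
seq = combine_rows(toy, start=0, end=4, window_size=2,
                   b_make_sequence=True, window_future_steps=1)
print(seq.shape)   # (4, 2, 3): 4 windows, 2 rows per window, 3 features
print(seq[0])      # the first window is zero-padded because there is no row before index 0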
In [10]:
from sklearn.model_selection import train_test_split
def create_dataset(df):

    
    df = df.drop(columns=['measurement_id', 'measurement_concept_id'])
    df["measurement_datetime"] = pd.to_datetime(df["measurement_datetime"]).astype('int64') / 10**9
    df = df.sort_values(['person_id', 'measurement_datetime']).reset_index(drop=True)

    
    data = df.to_numpy()

    # encode the vital-sign type as a number (an attempt to train one model across all artefact types; one-hot encoding is used later anyway)
    data = np.where(data=='HF', -0.75, data) 
    data = np.where(data=='map', -0.5, data) 
    data = np.where(data=='dia', -0.25, data) 
    data = np.where(data=='sys', 0., data) 
    data = np.where(data=='SpO2', 0.25, data) 
    data = np.where(data=='CO2', 0.5, data) 
    data = np.where(data=='Ta', 0.75, data)    
    
    return data.astype(float)
In [11]:
def AdmissionID_combine_rows(data, start=0, end=100, window_size=2, b_make_sequence=False, window_future_steps = 0, AdmissionIDs = []):
    """
    Combines rows of data based on specified AdmissionIDs into a single array with a specified window size.

    Args:
        data (array-like): The input data to be combined.
        start (int, optional): The starting index for combining rows. Defaults to 0.
        end (int, optional): The ending index (exclusive) for combining rows. Defaults to 100.
        window_size (int, optional): The size of the sliding window. Defaults to 2.
        b_make_sequence (bool, optional): If True, the function returns a sequence of combined rows. 
                                          If False, it returns a 2D array with combined rows. 
                                          Defaults to False.
        window_future_steps (int, optional): The number of future steps to include in each combined row. 
                                             Defaults to 0.
        AdmissionIDs (list, optional): A list of AdmissionIDs indicating the sections of data to combine. 
                                       Defaults to an empty list.

    Returns:
        array-like: The combined rows of data.

    """
    
    if(len(AdmissionIDs)>0):
        main_data = combine_rows(data, AdmissionIDs[0], AdmissionIDs[1], window_size, b_make_sequence, window_future_steps)
        for i in range(1, len(AdmissionIDs)-1):   
            
            main_data =np.append(main_data, combine_rows(data, AdmissionIDs[i], AdmissionIDs[i+1], window_size, b_make_sequence, window_future_steps), axis=0)
        
        main_data =np.append(main_data, combine_rows(data, AdmissionIDs[i+1], end, window_size, b_make_sequence, window_future_steps), axis=0)
        
        return main_data
    else:
        # no admission boundaries were given: fall back to combining over the full [start, end) range
        return combine_rows(data, start, end, window_size, b_make_sequence, window_future_steps)

In [12]:
def process_data(data_np, full_data, number_inputs=40, number_of_static_inputs = 0, num_prev_index = 16, main_label_index = -6, history_length=10, b_remove_artefacts=True, b_make_sequence=False, batch_size=1024, b_generate_test_data=False, window_future_steps = 0, b_one_hot=True):
    """
    Processes the data by applying various transformations and returns the preprocessed data.

    Args:
        data_np (array-like): The input data to be processed.
        full_data (array-like): The full dataset for computing statistics.
        number_inputs (int, optional): The number of input features. Defaults to 40.
        number_of_static_inputs (int, optional): The number of static input features. Defaults to 0.
        num_prev_index (int, optional): The number of previous indices to consider. Defaults to 16.
        main_label_index (int, optional): The index of the main label. Defaults to -6.
        history_length (int, optional): The history length for creating sequential data. Defaults to 10.
        b_remove_artefacts (bool, optional): If True, removes artefacts from the data. Defaults to True.
        b_make_sequence (bool, optional): If True, returns data as a sequence. If False, returns 2D array data.
                                          Defaults to False.
        batch_size (int, optional): The batch size for splitting the data into batches. Defaults to 1024.
        b_generate_test_data (bool, optional): If True, generates test data. Defaults to False.
        window_future_steps (int, optional): The number of future steps to include in each combined row.
                                             Defaults to 0.
        b_one_hot (bool, optional): If True, applies one-hot encoding to the labels. Defaults to True.

    Returns:
        tuple or array-like: The preprocessed data, AdmissionDataIndices, x_train, and y_train if b_generate_test_data
                             is True. Otherwise, x_train and y_train.

    """
    
    
    if(history_length<1):
        history_length = 1
    
    data = data_np
    
    chunksize = data.shape[0]
    
    
    AdmissionDataIndices = []
    admission_id = 1


            
            
    data_nan = np.zeros(data.shape)

    for i in range(len(data)):
        for j in range(len(data[0])):
            if(np.isnan(data[i,j])):
                data[i,j] = 0
                data_nan[i,j] = 1

    num_of_labels = data.shape[1]-number_inputs        
       
    max_values = []
    min_values = []
    mean_values = []
    std_values = []

    for i in range(len(data[0])):    
        max_values.append(np.max(full_data[:,i]))
        min_values.append(np.min(full_data[:,i]))
        mean_values.append(np.mean(full_data[:,i]))
        std_values.append(np.std(full_data[:,i]))
        if(b_remove_artefacts):
            for j in range(len(full_data)):
                if((full_data[j,i]>(mean_values[i]+7*std_values[i])) or (full_data[j,i]<(mean_values[i]-7*std_values[i]))):
                    print('Artefact')
                    full_data[j,i] = 0
                    if(j<data.shape[0] and i<data.shape[1]):
                        data[j,i] = 0
                        data_nan[j,i] = 1

    if(b_remove_artefacts):
        max_values.append(np.max(full_data[:,i]))
        min_values.append(np.min(full_data[:,i]))
        mean_values.append(np.mean(full_data[:,i]))
        std_values.append(np.std(full_data[:,i])) 
    
    
    abs_max_values =  []
    mean_values = []
    
    

    

    for i in range(0,len(data[0])):    
        abs_max_values.append(max(np.max(data[:,i]), abs(np.min(data[:,i]))))                   
        mean_values.append(np.mean(data[:,i]))
        if(i<data.shape[1]-num_of_labels-number_of_static_inputs):         
            if(abs_max_values[i]==0):
                data[:,i] = (data[:,i])
            else:
                data[:,i] = (data[:,i]-mean_values[i])/(abs_max_values[i]+1E-7)
    
    for i in range(len(data)):
        for j in range(len(data[0])):
            if(data_nan[i,j] == 1):
                data[i,j] = 0
    
    dif_data = np.diff(data[:,1:number_inputs-number_of_static_inputs], axis=0)       
    dif_data = np.append(dif_data, np.zeros((1,dif_data.shape[1])), axis=0)
    
    acc_data = np.diff(dif_data, axis=0)
    acc_data = np.append(acc_data, np.zeros((1,acc_data.shape[1])), axis=0)
    

 
    for i, ele in enumerate(data[:,0]):
        if(ele != admission_id):
            admission_id = ele
            AdmissionDataIndices.append(i)
            dif_data[i,:] = np.zeros_like(dif_data[i,:])
            acc_data[i,:] = np.zeros_like(acc_data[i,:])
    
    
    j=0
    prev_index = num_prev_index
    label_index = main_label_index
    current_id = AdmissionDataIndices[0]
    hypo_save = np.copy(data[:,label_index])
    hypo_count = 0
    
    if(num_prev_index>1):
        for i in range(len(data)-prev_index):
            if(i> AdmissionDataIndices[j]):
                if(j<len(AdmissionDataIndices)-1):
                    j+=1


            if((i+prev_index)<AdmissionDataIndices[j]):
                if(hypo_save[i]==1):
                    data[i,label_index ] =  2
                    hypo_count +=1
                    for x in range(i-prev_index,i):
                        if(hypo_save[x]!=1 and x>AdmissionDataIndices[j-1]):
                            data[x,label_index ] = 1
    
    if(history_length>1):
        if(b_make_sequence):
            data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))               
        else:
            data_hist = np.zeros((data.shape[0], (data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length+ num_of_labels))  
            data_full = np.zeros((data.shape[0], data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]))          
          
    else:
        #if(b_make_sequence):s
        #    data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))
        #else:
        data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))
            
            
    data_full[:,0:number_inputs] = data[:,0:number_inputs]
    data_full[:,number_inputs:number_inputs+dif_data.shape[1]] = dif_data
    data_full[:,number_inputs+dif_data.shape[1]:number_inputs+dif_data.shape[1]+acc_data.shape[1]] = acc_data
    
    
    if(history_length>1):
        if(b_make_sequence):
            data_full[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]) : data_full.shape[1]] = data[:,number_inputs:data.shape[1]]         
            
            data_hist = AdmissionID_combine_rows(data_full, 0, data_full.shape[0], history_length, b_make_sequence, window_future_steps, AdmissionDataIndices)
            
        else:          
            data_hist[:,0:(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length] = AdmissionID_combine_rows(data_full, 0, data_full.shape[0], history_length, b_make_sequence, window_future_steps, AdmissionDataIndices)
            data_hist[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length : data_hist.shape[1]] = data[:,number_inputs:data.shape[1]]
           
        data = data_hist
        del data_hist

    else:
        data_full[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]) : data_full.shape[1]] = data[:,number_inputs:data.shape[1]]
        data = data_full
        
       

    del data_full
    del dif_data
    del acc_data


    data[np.isnan(data)] = 0

    
    if(b_generate_test_data):

        if(b_make_sequence):
            x_train = data[:,:, 1:-num_of_labels]
            y_train = data[:,-1, main_label_index]
            y_train = tf.cast(y_train , tf.int32)
            if(num_prev_index>1):
                if(b_one_hot):
                    y_train= tf.one_hot(y_train,3,)
            else:
                if(b_one_hot):
                    y_train= tf.one_hot(y_train,2,)
            if(batch_size>0):
                x_train = np.array(np.array_split(x_train, int(chunksize/batch_size)))
                y_train = np.array(np.array_split(y_train, int(chunksize/batch_size)))

        else:
            x_train = data[:, 1:-num_of_labels]
            y_train = data[:, label_index]
            y_train = y_train.reshape(-1,1)

        return data, AdmissionDataIndices, x_train, y_train
    
    if(b_make_sequence):
        x_train = data[:,:, 1:-num_of_labels]   
        y_train = data[:,-1, main_label_index]
        y_train = tf.cast(y_train , tf.int32)
        if(num_prev_index>1):
            if(b_one_hot):
                y_train= tf.one_hot(y_train,3,)
        else:
            if(b_one_hot):
                y_train= tf.one_hot(y_train,2,)
        if(batch_size>0):
            x_train = np.array(np.array_split(x_train, int(chunksize/batch_size)))
            y_train = np.array(np.array_split(y_train, int(chunksize/batch_size)))
        
        return x_train, y_train  
        
    else:
        x_train = data[:, 1:-num_of_labels]   
        y_train = data[:, label_index]     

        return x_train, y_train.reshape(-1,1)     
In [13]:
def trim_to_batch_size(array, batch_size):
    # Calculate the number of rows to keep
    length_to_keep = len(array) // batch_size * batch_size
    
    # Trim array
    array = array[:length_to_keep]
    
    return array
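
A one-line sanity check for the trimming helper (toy array only):

In [ ]:
arr = np.arange(10).reshape(10, 1)
print(trim_to_batch_size(arr, 4).shape)   # (8, 1): 10 rows trimmed to the largest multiple of the batch size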
In [14]:
from sklearn.model_selection import train_test_split

def create_processed_sets(data, number_inputs=40, number_of_static_inputs = 0, num_prev_index = 16, main_label_index = -6, history_length=10, b_remove_artefacts=True, b_make_sequence=False, batch_size=1024, b_generate_test_data=False, window_future_steps = 0, b_one_hot=True):
    """
    Creates processed sets of training and testing data for a given dataset.

    Args:
        data (array-like): The input dataset.
        number_inputs (int, optional): The number of input features. Defaults to 40.
        number_of_static_inputs (int, optional): The number of static input features. Defaults to 0.
        num_prev_index (int, optional): The number of previous indices to consider. Defaults to 16.
        main_label_index (int, optional): The index of the main label. Defaults to -6.
        history_length (int, optional): The history length for creating sequential data. Defaults to 10.
        b_remove_artefacts (bool, optional): If True, removes artefacts from the data. Defaults to True.
        b_make_sequence (bool, optional): If True, returns data as a sequence. If False, returns 2D array data.
                                          Defaults to False.
        batch_size (int, optional): The batch size for splitting the data into batches. Defaults to 1024.
        b_generate_test_data (bool, optional): If True, generates test data. Defaults to False.
        window_future_steps (int, optional): The number of future steps to include in each combined row.
                                             Defaults to 0.
        b_one_hot (bool, optional): If True, applies one-hot encoding to the labels. Defaults to True.

    Returns:
        tuple: The processed sets of training and testing data (x_train, y_train, x_test, y_test).

    """
    
    
    # Get unique person IDs (stored in the second column, index 1)
    unique_ids = np.unique(data[:, 1])
    
    # Shuffle these IDs and perform the split
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42) # Adjust test_size as needed
    
    # Create the train and test arrays
    train_set = data[np.isin(data[:, 1], train_ids)]
    test_set = data[np.isin(data[:, 1], test_ids)]

    train_set = trim_to_batch_size(train_set, batch_size)
    test_set = trim_to_batch_size(test_set, batch_size)

    # 'station' and 'sign' are expected to be set globally (see the model-selection cell further down)
    np.savetxt('value_vector_'+station+'_'+sign+'.txt', np.roll( test_set[:,1],-history_length+1), fmt='%d')

    x_train, y_train = process_data(train_set, data, number_inputs=number_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = num_prev_index, main_label_index = main_label_index, history_length=history_length, b_make_sequence=b_make_sequence, batch_size=batch_size, b_generate_test_data=b_generate_test_data, window_future_steps=window_future_steps, b_one_hot=b_one_hot, b_remove_artefacts=b_remove_artefacts)
    x_test, y_test = process_data(test_set, data, number_inputs=number_inputs, num_prev_index = num_prev_index, number_of_static_inputs=number_of_static_inputs, main_label_index = main_label_index, history_length=history_length, b_make_sequence=b_make_sequence, batch_size=batch_size, b_generate_test_data=b_generate_test_data, window_future_steps=window_future_steps, b_one_hot=b_one_hot, b_remove_artefacts=b_remove_artefacts)
    return x_train, y_train, x_test, y_test
In [15]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Bidirectional, LSTM
import numpy as np

from tensorflow.keras import backend as K




def define_model(num_neurons, history_length, input_size, number_of_outputs):
    """
    Defines an LSTM model for sequence prediction.

    Args:
        num_neurons (int): The number of neurons in the LSTM layer.
        history_length (int): The history length of the input sequences.
        input_size (int): The number of features in each input sequence.
        number_of_outputs (int): The number of output classes.

    Returns:
        Model: The defined LSTM model.

    """

    
    lstm_layer = keras.layers.LSTM(num_neurons, input_shape=(history_length, input_size))
    model = keras.models.Sequential(
        [
            lstm_layer,
            keras.layers.BatchNormalization(),
            keras.layers.Dropout(0.05),
            keras.layers.Dense(number_of_outputs, activation= 'sigmoid'), 
        ]
    )

    model.summary()
    return model
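
A quick shape check of the network defined above, using the same hyper-parameters that appear in the model summary further down (8 LSTM units, 31 time steps, 6 features, 1 output); the random input is illustrative only:

In [ ]:
sanity_model = define_model(8, 31, 6, 1)
dummy = np.random.rand(4, 31, 6).astype('float32')   # batch of 4 toy sequences
print(sanity_model.predict(dummy).shape)              # (4, 1): one sigmoid probability per sequence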
In [195]:
modelname = "model_2048_16_8_True_Ta_54"
x = modelname.split("_")
batch_size = int(x[1])
history_length = int(x[2])
NumberOfNeurons = int(x[3])
boolean = (x[4] == 'True')  # compare the string explicitly: bool('False') would also evaluate to True
sign = x[5]
number_of_static_inputs=0
stations = ['ALL', 'OP', 'ICU']
station = stations[2]

if(boolean):
    window_future_steps = history_length 
else:
    window_future_steps = 0

number_of_inputs = 3
number_of_outputs = 1


df_sign = df.loc[df['Vitalsign'] == sign]
data_sign = create_dataset(df_sign)
x_train, y_train, X_test, y_test = create_processed_sets(data_sign, number_inputs=number_of_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = 0, main_label_index = -1, history_length=history_length, b_make_sequence=True, batch_size=batch_size, b_generate_test_data=False, window_future_steps=window_future_steps, b_one_hot=False, b_remove_artefacts=False)
 
model = define_model(NumberOfNeurons, history_length + window_future_steps-1, x_train.shape[3], number_of_outputs)              
optim = tf.keras.optimizers.legacy.Adam()

model.compile(
    optimizer=optim ,
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy", tf.keras.metrics.TruePositives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])

                    
model.load_weights(modelname)
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 lstm_7 (LSTM)               (None, 8)                 480       
                                                                 
 batch_normalization_7 (Batc  (None, 8)                32        
 hNormalization)                                                 
                                                                 
 dropout_7 (Dropout)         (None, 8)                 0         
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
=================================================================
Total params: 521
Trainable params: 505
Non-trainable params: 16
_________________________________________________________________
Out[195]:
<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x14b635fb5690>
In [196]:
'''

ls_sign = ['Ta', 'CO2']
stations = ['ALL', 'OP', 'ICU']
station = stations[0]
number_of_outputs = 1
number_of_inputs = 3 #4 for multiple features
number_of_static_inputs = 0 #1 for multiple features
b_LSTM = True
hist_lengths = [8, 16]
batch_sizes = [2048]
truefalse = [True]
number_of_neurons = [8, 16]
loss_history = []
for sign in ls_sign:
    for boolean in truefalse:
        for batch_size in batch_sizes:
            for hist in hist_lengths:
                for neurons in number_of_neurons:
                    best_loss = 10000000000
                    best_acc = 0
                    best_sens = 0
                    best_spez = 0
                    best_auc = 0
                    additional_steps = 3
                    batch_size = batch_size
                    history_length = hist
                    if(boolean):
                        window_future_steps = history_length 
                    else:
                        window_future_steps = 0

                    df_sign = df.loc[df['Vitalsign'] == sign]
                    data_sign = create_dataset(df_sign)
                    x_train, y_train, x_test, y_test = create_processed_sets(data_sign, number_inputs=number_of_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = 0, main_label_index = -1, history_length=history_length, b_make_sequence=True, batch_size=batch_size, b_generate_test_data=False, window_future_steps=window_future_steps, b_one_hot=False, b_remove_artefacts=False)

                    model = define_model(neurons, history_length + window_future_steps-1, x_train.shape[3], number_of_outputs)              
                    optim = tf.keras.optimizers.Adam(learning_rate=0.00005)
                    model.compile(
                        optimizer=optim ,
                        loss=tf.keras.losses.BinaryCrossentropy(),
                        metrics=["accuracy", tf.keras.metrics.TruePositives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])

                    

                    for episode in range(100):
                        name = "model_"+str(batch_size)+"_"+str(history_length)+"_"+str(neurons)+"_"+str(boolean)+"_"+sign+"_"+str(episode)
                        average_val_loss = 0
                        average_val_acc = 0
                        average_spez = 0
                        average_sens = 0
                        average_auc = 0
                        no_spez_counter = 0
                        no_sens_counter = 0
                        no_auc_counter = 0
                        num_of_steps = x_train.shape[0]
                        for i in range(x_train.shape[0]):
                            #model.fit(x_train[i,:,:,:], y_train[i,:,:])
                            model.fit(x_train[i,:,:,:], y_train[i,:])
                        print('\n')
                        print('\n')
                        print('Validation: ')
                        print('--------------------------------------------------')
                        print('\n')
                        num_of_steps = x_test.shape[0]
                        for i in range(num_of_steps):

                            #loss, acc, spez, sens, auc = model.evaluate(x_test[i,:,:,:], y_test[i,:,:])
                            loss, acc, tp, tn, fp, fn, auc = model.evaluate(x_test[i,:,:,:], y_test[i,:])
                            average_val_loss += loss
                            average_val_acc += acc


                            if((tp + fn)== 0 or (tp + fp)== 0 or (tn + fp)== 0 or (tn + fn)== 0):
                                no_auc_counter += 1 
                            else:
                                average_auc += auc

                            if((tp + fn)== 0):
                                no_sens_counter += 1
                            else:
                                average_sens += (tp / (tp + fn))  

                            if((tn + fp)== 0):
                                no_spez_counter += 1
                            else: 
                                average_spez += (tn / (tn + fp))
                        print('\n')
                        print('\n')
                        print('New best results: ')
                        print('--------------------------------------------------')
                        print('\n')
        

                        if(average_val_loss/num_of_steps>best_loss and average_val_acc/num_of_steps<best_acc):
                            additional_steps -= 1
                            if(additional_steps==0):
                                break
                        if(average_val_loss/num_of_steps<best_loss):
                            best_loss = average_val_loss/num_of_steps
                            print('New best loss: ', best_loss)
                            additional_steps = 10
                        if(average_val_acc/num_of_steps>best_acc):
                            best_acc = average_val_acc/num_of_steps
                            print('New best accuracy: ', best_acc)
                            additional_steps = 10
                        if(average_sens/(num_of_steps-no_sens_counter)>best_sens):
                            best_sens = average_sens/(num_of_steps-no_sens_counter)
                            print('New best Sensitivity: ', best_sens)
                        if(average_spez/(num_of_steps-no_spez_counter)>best_spez):
                            best_spez = average_spez/(num_of_steps-no_spez_counter)
                            print('New best Specificity: ', best_spez)
                        if(average_auc/(num_of_steps-no_auc_counter+0.000001)>best_auc):
                            best_auc = average_auc/(num_of_steps-no_auc_counter+0.000001)
                            print('New best AUC: ', best_auc)
                            additional_steps = 10
                        loss_history.append([average_val_loss/num_of_steps, average_val_acc/num_of_steps, average_spez/(num_of_steps-no_spez_counter), average_sens/(num_of_steps-no_sens_counter), average_auc/(num_of_steps-no_auc_counter+0.000001), name])
                        with open("loss_history", 'w') as file:
                            for item in loss_history:         
                                for word in item:
                                    file.write(str(word) + '\t')
                                file.write('\n')
                        model.save_weights(name)
                        print('\n')
                        print('Number of tries left:', additional_steps)
                        print('--------------------------------------------------')
                        print('\n')
'''
In [197]:
AdmissionDataIndices = []
admission_id = 1

for i, ele in enumerate(data_sign[:,0]):
    if(ele != admission_id):
        admission_id = ele
        AdmissionDataIndices.append(i)
In [198]:
try:
    X_test = x_test
except:
    pass
try:
    X_train = x_train
except:
    pass
In [199]:
neg_neg = 0 
neg_pos = 0 
pos_neg = 0 
pos_pos = 0
counter = 0 
X_train_len = X_train.shape[0]*X_train.shape[1]
    

for batch_index in range(X_test.shape[0]):

    label_curve = [] 
    predicted_curve = [] 
    main_curve = []
    prob_pos = []
    preds = model.predict(X_test[batch_index,:,:,:])
    for i in range(X_test.shape[1]):
        patient_id = df_sign.iloc[i+X_train_len+batch_index*batch_size,1]
        if(get_group(patient_id, icu)==station or station == 'ALL'):
            pred = preds[i]
            label = y_test[batch_index,i]
            main_curve.append(X_test[batch_index, i, -1 ,0])
            label_curve.append(label)
    
    
    
    
            if(label>0):
                counter+=1
    
            if(label==0):
                if(pred<0.5):
                    neg_neg+=1
                else:
                    neg_pos+=1
    
            else:
                if(pred<0.5):
                    pos_neg+=1
                else:
                    pos_pos+=1
    
            prob_pos.append(preds.flatten())
            predicted_curve.append(pred)
sum_neg = neg_neg + neg_pos
sum_pos = pos_neg + pos_pos

print(neg_neg, neg_pos, pos_neg, pos_pos) 
print(neg_neg/sum_neg, pos_pos/sum_pos) 
print((neg_neg+ pos_pos) / ((sum_neg+ sum_pos)))
64/64 [==============================] - 0s 1ms/step
1926 0 7 11
1.0 0.6111111111111112
0.9963991769547325
In [200]:
X_test.shape
Out[200]:
(1, 2048, 31, 6)
In [201]:
y_test = y_test.flatten()
In [202]:
y_test.shape
Out[202]:
(2048,)
In [203]:
prob_pos = []
for batch_index in range(X_test.shape[0]):
    prob_pos.append(model.predict(X_test[batch_index,:,:,:]))

prob_pos_train = []
for batch_index in range(x_train.shape[0]):
    prob_pos_train.append(model.predict(x_train[batch_index,:,:,:]))
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
64/64 [==============================] - 0s 1ms/step
In [204]:
prob_pos = np.array(prob_pos).flatten()
In [205]:
prob_pos_train = np.array(prob_pos_train).flatten()
In [206]:
real_values = np.loadtxt('value_vector_'+station+'_'+sign+'.txt', dtype=int)
In [207]:
import matplotlib.pyplot as plt
import numpy as np

# Create a color array based on TP, FP, TN, FN
colors = []

for i in range(len(y_test)):
    label = y_test[i]
    pred = prob_pos[i]
    
    if label > 0 and pred >= 0.5:  # TP: Actual positive and predicted as positive
        colors.append('green')
    elif label == 0 and pred < 0.5:  # TN: Actual negative and predicted as negative
        colors.append('blue')
    elif label == 0 and pred >= 0.5:  # FP: Actual negative but predicted as positive
        colors.append('red')
    else:  # FN: Actual positive but predicted as negative
        colors.append('orange')

# Convert colors to a numpy array
colors = np.array(colors)

# Create a scatter plot of main_curve with color-coding based on prediction correctness
plt.scatter(range(len(real_values)), real_values, c=colors, s=15)  # s parameter controls the size of dots
plt.plot(range(len(real_values)), real_values, color='black', linewidth=0.5)  # plot the line

# Title and labels
plt.title('Example Signal with Prediction Outcomes')
plt.xlabel('Time Step (index)')
plt.ylabel('Example Signal')

# Setting the x limit
plt.xlim([1200, 1600])

# Legend
plt.scatter([], [], color='green', label='True Positive', s=15)
plt.scatter([], [], color='blue', label='True Negative', s=15)
plt.scatter([], [], color='red', label='False Positive', s=15)
plt.scatter([], [], color='orange', label='False Negative', s=15)
plt.legend()
plt.savefig('Example_plot_'+station+'_'+sign+'.png')

plt.show()
In [208]:
from sklearn.linear_model import LogisticRegression


raw_probabilities = prob_pos_train.reshape(-1, 1)
# Fit the Logistic Regression to calibrate the probabilities
lr = LogisticRegression().fit(raw_probabilities, y_train.flatten())
In [209]:
calibrated_probabilities = lr.predict_proba(prob_pos.reshape(-1, 1))  # Calibrate probabilities
In [210]:
prob_pos = calibrated_probabilities
In [211]:
prob_pos = prob_pos[:,1]
In [212]:
print(prob_pos)
[0.00331583 0.00331507 0.00331439 ... 0.00332162 0.00332071 0.00331997]
In [213]:
np.savetxt('probability_vector_'+station+'_'+sign+'.txt', prob_pos, fmt='%f')
In [214]:
np.savetxt('label_vector_'+station+'_'+sign+'.txt', y_test, fmt='%d')
In [ ]:
 
In [215]:
y_pred = (prob_pos > 0.5).astype(int)
In [216]:
import seaborn as sns
sns.set_style("white")



from sklearn.calibration import calibration_curve

fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10)

from sklearn.metrics import roc_curve, roc_auc_score

# Calculate the ROC curve points
fpr, tpr, thresholds = roc_curve(y_test, prob_pos)

# Calculate the AUROC score
auroc = roc_auc_score(y_test, prob_pos)
In [217]:
# Figure 1 for Calibration curve and Histogram
fig1 = plt.figure(figsize=(10, 8)) # Set the overall figure size

gs1 = plt.GridSpec(2, 1, height_ratios=[1, 1]) # Set height ratios to 1:1 for equal vertical size

# Calibration curve
ax1 = plt.subplot(gs1[0])
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")  
ax1.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % 'LSTM-Classifier', color='blue')
ax1.set_xlim([0, 1]) # Set x range to 0 to 1
ax1.set_ylim([0, 1]) # Set y range to 0 to 1
ax1.set_ylabel("Fraction of positives")
ax1.set_title('Calibration plot of LSTM classifier for '+sign+ ' ('+station+')')
ax1.legend(loc="upper left")
ax1.xaxis.tick_top() # Move x-axis labels to top
ax1.xaxis.set_label_position('top') # Move x-axis label to top

# Position legend of plot 1 on x-axis
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels,  ncol=2)

# Histogram as an "inset" plot
ax2 = plt.subplot(gs1[1], sharex=ax1) # Sharing x-axis with first plot
hist, bins = np.histogram(prob_pos, bins=10, range=(0, 1))
width = (bins[1] - bins[0]) * 0.8
centers = (bins[:-1] + bins[1:]) / 2
ax2.bar(centers, hist, align='center', width=width, alpha=0.5, color='green', label='Histogram')
ax2.set_xlabel("Predicted probability") # Add x-axis label
ax2.set_ylabel("Number of cases")

ax2.yaxis.set_major_locator(plt.MaxNLocator(5))
plt.subplots_adjust(hspace=0)  

plt.savefig('Calibration_Histogram_'+station+'_'+sign+'.png')
plt.show()
In [218]:
import numpy as np
from sklearn.metrics import roc_auc_score, recall_score, precision_score, accuracy_score
from sklearn.utils import resample

# Define the number of bootstrap samples
n_bootstrap = 2000

# Create empty lists to store bootstrapped metrics
sensitivity_values = []
specificity_values = []
ppv_values = []
npv_values = []
auroc_values = []
accuracy_values = []

# Perform bootstrap resampling
for _ in range(n_bootstrap):
    # Sample with replacement from the predictions
    indices = resample(range(len(y_pred)), replace=True)
    bootstrap_y_test = y_test[indices]
    bootstrap_y_pred = y_pred[indices]
    bootstrap_prob_pos = prob_pos[indices]

    # Calculate metrics for the bootstrap sample
    sensitivity = recall_score(bootstrap_y_test, bootstrap_y_pred)
    specificity = recall_score(bootstrap_y_test, bootstrap_y_pred, pos_label=0)
    ppv = precision_score(bootstrap_y_test, bootstrap_y_pred)
    npv = precision_score(bootstrap_y_test, bootstrap_y_pred, pos_label=0)
    auroc = roc_auc_score(bootstrap_y_test, bootstrap_prob_pos)
    accuracy = accuracy_score(bootstrap_y_test, bootstrap_y_pred)

    # Append the metric values to the lists
    sensitivity_values.append(sensitivity)
    specificity_values.append(specificity)
    ppv_values.append(ppv)
    npv_values.append(npv)
    auroc_values.append(auroc)
    accuracy_values.append(accuracy)

# Calculate confidence intervals
confidence_interval = 0.95
alpha = (1 - confidence_interval) / 2
lower_percentile = alpha * 100
upper_percentile = (1 - alpha) * 100

# Calculate confidence intervals for each metric
sensitivity_ci = np.percentile(sensitivity_values, [lower_percentile, upper_percentile])
specificity_ci = np.percentile(specificity_values, [lower_percentile, upper_percentile])
ppv_ci = np.percentile(ppv_values, [lower_percentile, upper_percentile])
npv_ci = np.percentile(npv_values, [lower_percentile, upper_percentile])
auroc_ci = np.percentile(auroc_values, [lower_percentile, upper_percentile])
accuracy_ci = np.percentile(accuracy_values, [lower_percentile, upper_percentile])


with open("metrics_"+station+"_"+sign+".txt", "w") as file:
    file.write("Accuracy: {:.3f} ({:.3f}-{:.3f})\n".format(np.mean(accuracy_values), accuracy_ci[0], accuracy_ci[1]))
    file.write("Sensitivity: {:.3f} ({:.3f}-{:.3f})\n".format(sensitivity, sensitivity_ci[0], sensitivity_ci[1]))
    file.write("Specificity: {:.3f} ({:.3f}-{:.3f})\n".format(specificity, specificity_ci[0], specificity_ci[1]))
    file.write("Positive Predictive Value (PPV): {:.3f} ({:.3f}-{:.3f})\n".format(ppv, ppv_ci[0], ppv_ci[1]))
    file.write("Negative Predictive Value (NPV): {:.3f} ({:.3f}-{:.3f})\n".format(npv, npv_ci[0], npv_ci[1]))
    file.write("Area Under the ROC Curve (AUROC): {:.3f} ({:.3f}-{:.3f})".format(auroc, auroc_ci[0], auroc_ci[1]))
In [219]:
from sklearn.metrics import roc_curve, auc
from numpy import interp

n_bootstraps = 2000
rng_seed = 42  # control reproducibility
bootstrapped_tprs = []
bootstrapped_auc = []

base_fpr = np.linspace(0, 1, 101)  # Fix the FPR values for interpolation

rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
    # bootstrap by sampling with replacement on the prediction indices
    indices = rng.randint(0, len(prob_pos), len(prob_pos))
    if len(np.unique(y_test[indices])) < 2:
        # We need at least one positive and one negative sample for ROC AUC
        # to be defined: reject the sample
        continue

    fpr_boot, tpr_boot, _ = roc_curve(y_test[indices], prob_pos[indices])
    bootstrapped_auc.append(auc(fpr_boot, tpr_boot))

    # Interpolate TPR for the fixed FPR values
    tpr_boot_interp = interp(base_fpr, fpr_boot, tpr_boot)
    bootstrapped_tprs.append(tpr_boot_interp)

# Convert list to a 2D array
bootstrapped_tprs = np.array(bootstrapped_tprs)

# Compute the lower and upper bound of the ROC curve
tprs_lower = np.percentile(bootstrapped_tprs, 2.5, axis=0)
tprs_upper = np.percentile(bootstrapped_tprs, 97.5, axis=0)

# Compute the lower and upper bound of the ROC AUC
auc_lower = np.percentile(bootstrapped_auc, 2.5)
auc_upper = np.percentile(bootstrapped_auc, 97.5)

print("Confidence interval for the ROC curve: [{:0.3f} - {:0.3}]".format(auc_lower, auc_upper))
Confidence interval for the ROC curve: [0.986 - 0.999]
In [220]:
fig2 = plt.figure(figsize=(10, 6))

specificity = 1 - base_fpr  # Calculate Specificity (1 - False Positive Rate)

ax3 = plt.subplot(1, 1, 1)
ax3.plot([1, 0], [0, 1], 'k--')  # Invert x-axis
# recompute the point-estimate AUC here because 'auroc' was overwritten inside the bootstrap loop above
ax3.plot(specificity, np.median(bootstrapped_tprs, axis=0), label=r'ROC (AUC = %0.2f $\pm$ %0.2f)' % (roc_auc_score(y_test, prob_pos), (auc_upper-auc_lower)/2), color='blue')
ax3.fill_between(specificity, tprs_lower, tprs_upper, color='grey', alpha=0.2)
ax3.set_xlabel('Specificity (True Negative Rate)')
ax3.set_ylabel('Sensitivity (True Positive Rate)')
ax3.set_title('Receiver Operating Characteristic for '+sign+ ' ('+station+')')
ax3.legend(loc='lower right')
ax3.set_xlim([1, 0])  # Invert x-axis limits
ax3.set_ylim([0, 1])
ax3.set_xticks([1,0.8,0.6,0.4,0.2,0])  # Set x-axis tick locations
ax3.set_xticklabels([1,0.8,0.6,0.4,0.2,0])  # Set x-axis tick labels




plt.tight_layout()
plt.savefig('ROC_Curve_'+station+'_'+sign+'.png')
plt.show()