import numpy as np
import scipy.stats as stats
import pandas as pd
class artefact_recognition:
def iqr(df, column_list, factor = 3):
""" Executes artefact recognition by Interqurtile Range
INPUT:
df: df of vitals for one patient
column_list: list of columns to remove artefacts from.
factor: multiple of the IQR beyond which data will be dropped - default is 3
OUTPUT:
cleaned df."""
for i in column_list:
df = df.drop(df[df[i] < (np.percentile(df[i], 25) - factor * stats.iqr(df[i]))].index)
df = df.drop(df[df[i] > (np.percentile(df[i], 75) + factor * stats.iqr(df[i]))].index)
return df
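The filter above keeps only samples x that lie inside the fence

$$ Q_1 - f \cdot \mathrm{IQR} \;\le\; x \;\le\; Q_3 + f \cdot \mathrm{IQR}, \qquad \mathrm{IQR} = Q_3 - Q_1, $$

where f is the factor argument (default 3) and Q1, Q3 are the 25th and 75th percentiles.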
def z_value(df, column_list):
""" Executes artefact recognition by z-value
INPUT:
df: df of vitals for one patient
column_list: list of columns to remove artefacts from.
OUTPUT:
cleaned df."""
z_scores = stats.zscore(df[column_list])
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
return df[filtered_entries]
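The z-value filter keeps a row only if every selected column satisfies

$$ |z| = \left|\frac{x - \mu}{\sigma}\right| < 3, $$

with mu and sigma the per-column mean and standard deviation used by stats.zscore.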
cut_off_df = pd.DataFrame(
data = [[20, 300], [10, 250], [5, 225], [20, 300], [10, 250], [5, 225], [5, 150], [0, 100], [25, 45], [25, 45], [5, 300], [5, 150]],
columns = ['mi', 'ma'],
index = ['sys', 'map', 'dia', 'n_sys', 'n_map', 'n_dia', 'etCO2', 'SpO2', 'Ta', 'Tb', 'HF', 'CO2'])
def cut_off(df, column_list, cut_offs = cut_off_df):
""" Executes artefact recognition by applying pre-defined Cut-offs
INPUT:
df: df of vitals for one patient
column_list: list of columns to remove artefacts from.
cut_offs: dataframe with the column names and their min (mi) and max (ma) values - default is a predefined df for sys, map, dia, n_sys, n_map, n_dia, etCO2, SpO2, Ta, Tb, HF, CO2
OUTPUT:
cleaned df."""
for i in column_list:
df = df.drop(df[df[i] < cut_offs.loc[i]['mi']].index)
df = df.drop(df[df[i] > cut_offs.loc[i]['ma']].index)
return df
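A minimal usage sketch of the cut-off filter (hypothetical toy data, run as its own cell; the methods take the df as their first argument, so they are called on the class itself):
vitals_df = pd.DataFrame({'sys': [120, 310, 95], 'SpO2': [98, 99, 101]})  # hypothetical example values
cleaned = artefact_recognition.cut_off(vitals_df, ['sys', 'SpO2'])  # drops sys=310 (>300) and SpO2=101 (>100)
print(cleaned)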
def lrds_(dataset, cols, neighbors, distances):
""" Computes the local reachability density (lrd) of every point.
INPUT:
dataset: df of points, including their k-distance in the column 'k_distance_B'.
cols: the two columns spanning the point coordinates.
neighbors: per point, its index followed by the indices of its k nearest neighbours.
distances: per point, the distances to its k nearest neighbours.
OUTPUT:
list with one lrd value per point."""
lrds = []
for A, *Nk in neighbors:
xA, yA = dataset.loc[A, cols]
sum_reach_distance = 0
for B in Nk:
if B < len(dataset):
k_distance_B = dataset.loc[B, 'k_distance_B']
xB, yB = dataset.loc[B, cols]
dAB = distance.euclidean([xA, yA], [xB, yB])
reach_distance_AB = max(k_distance_B, dAB)
sum_reach_distance += reach_distance_AB
if(len(Nk) == 0):
print('zero entry neighbors list')
if(sum_reach_distance == 0):
sum_reach_distance = 1e-9
lrd_A = 1/(sum_reach_distance/len(Nk))
lrds.append(lrd_A)
return lrds
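The loop above computes the standard local reachability density: for a point A with neighbourhood N_k(A),

$$ \mathrm{reach\text{-}dist}_k(A,B) = \max\bigl(k\text{-}\mathrm{distance}(B),\, d(A,B)\bigr), \qquad \mathrm{lrd}_k(A) = \left(\frac{\sum_{B \in N_k(A)} \mathrm{reach\text{-}dist}_k(A,B)}{|N_k(A)|}\right)^{-1}. $$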
def lofs_(lrds, neighbors, i):
""" Computes the local outlier factor (LOF) of every point from its lrd and its neighbours' lrds.
INPUT:
lrds: list of local reachability densities, one per point.
neighbors: per point, its index followed by the indices of its k nearest neighbours.
i: unused.
OUTPUT:
list with one LOF value per point."""
lof = []
if(len(lrds) < len(neighbors[0])):
return lrds
for A, *Nk in neighbors:
tmp = []
for B in Nk:
try:
tmp.append(lrds[B])
except IndexError:
print(lrds)
print(len(neighbors))
print(B)
sum_lrds = sum(tmp)
lof.append(sum_lrds/(len(Nk) * lrds[A]))
return lof
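From those densities, each point's local outlier factor is the average ratio of its neighbours' densities to its own,

$$ \mathrm{LOF}_k(A) = \frac{\sum_{B \in N_k(A)} \mathrm{lrd}_k(B)}{|N_k(A)| \cdot \mathrm{lrd}_k(A)}, $$

so values near 1 indicate inliers and values well above 1 indicate outliers, which is what max_lof_threshold below is compared against.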
from scipy.spatial import KDTree, distance
def lof(df, column_list, max_lof_threshold=1.5):
""" Executes artefact recognition by Local Outlier Factor (LOF)
INPUT:
df: df of vitals for one patient, including a 'measurement_datetime' column.
column_list: list of columns to remove artefacts from.
max_lof_threshold: LOF value above which a point counts as an outlier - default is 1.5
OUTPUT:
cleaned df."""
x = df.copy()
x.reset_index(inplace=True)
x['seconds'] = (x['measurement_datetime'].astype('int64') // 1e9).astype('int64')
k = 7
lofs = []
for column in column_list:
kd = KDTree(x[['seconds', column]])
# get the k nearest neighbors of every point and their distances
kdquery = kd.query(x[['seconds', column]], k=k)
distances, neighbors = kdquery
# add the k-distance to B to the df
x['k_distance_B'] = distances[:, -1]
lrds = lrds_(x, ['seconds', column], neighbors, distances)
lofs.append(lofs_(lrds, neighbors, 0))
return x[(np.array(lofs) < max_lof_threshold).any(axis = 0)]
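A minimal usage sketch of the LOF-based cleaning (hypothetical toy data, run as its own cell; it assumes lrds_, lofs_ and lof are available as written above and that the df has a measurement_datetime column):
vitals_df = pd.DataFrame({
'measurement_datetime': pd.date_range('2023-01-01', periods=200, freq='min'),
'HF': np.r_[np.random.normal(80, 3, 199), [240]],  # hypothetical heart-rate series with one isolated spike
})
cleaned = lof(vitals_df, ['HF'], max_lof_threshold=1.5)
print(len(vitals_df), len(cleaned))  # the spike should receive a high LOF and be dropped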
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
# Loading Dataset with pandas
df = pd.read_csv('vitalsign_dataset.csv')
objects = pd.read_pickle("patient_list.pkl")
icu = np.asarray(objects)
def get_group(patient_id, icu):
"""
Retrieves the group of a patient from the ICU based on their ID.
Args:
patient_id (int): The ID of the patient to search for.
icu (array-like): The ICU data structure containing patient information.
Returns:
str or None: The group of the patient if found, or None if the patient is not in the ICU.
"""
for i, pat_id in enumerate(icu[:,5]):
if(int(patient_id)==int(pat_id)):
return icu[i,1]
return None
df["artefact"] = df["artefact"].astype(int)
features = df.columns
print(features)
Index(['measurement_id', 'person_id', 'measurement_concept_id', 'value_as_number', 'measurement_datetime', 'Vitalsign', 'artefact'], dtype='object')
def combine_rows(data, start=0, end=100, window_size=2, b_make_sequence=False, window_future_steps = 0):
"""
Combines rows of data into a single array with specified window size.
Args:
data (array-like): The input data to be combined.
start (int, optional): The starting index for combining rows. Defaults to 0.
end (int, optional): The ending index (exclusive) for combining rows. Defaults to 100.
window_size (int, optional): The size of the sliding window. Defaults to 2.
b_make_sequence (bool, optional): If True, the function returns a sequence of combined rows.
If False, it returns a 2D array with combined rows.
Defaults to False.
window_future_steps (int, optional): The number of future steps to include in each combined row.
Defaults to 0.
Returns:
array-like: The combined rows of data.
"""
if(b_make_sequence):
main_data = []
else:
main_data = np.zeros((end-start, data.shape[1]*window_size))
size = data.shape[1]
for i in range(start, end):
input_data = []
index = 0
for j in range(i - window_size+1 , i+window_future_steps):
if(not (j<0 or j<start or j>=end)):
if(b_make_sequence):
input_data.append(data[j,:])
else:
main_data[i, size*index:size*index+size] = data[j,:]
else:
if(b_make_sequence):
input_data.append(np.zeros(data[0,:].shape))
index+=1
main_data.append(input_data)
if(b_make_sequence):
return np.array(main_data).astype(float)
else:
return main_data.astype(float)
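A small worked example of the windowing (toy data, run as its own cell): with window_size=2 and window_future_steps=1 each output element collects window_size + window_future_steps - 1 = 2 consecutive input rows, and positions before the start of the data are padded with zero rows.
toy = np.arange(8, dtype=float).reshape(4, 2)  # 4 samples with 2 features each
seq = combine_rows(toy, start=0, end=4, window_size=2, b_make_sequence=True, window_future_steps=1)
print(seq.shape)  # (4, 2, 2)
print(seq[0])  # a zero row (before the start of the data) followed by sample 0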
from sklearn.model_selection import train_test_split
def create_dataset(df):
df = df.drop(columns=['measurement_id', 'measurement_concept_id'])
df["measurement_datetime"] = pd.to_datetime(df["measurement_datetime"]).astype('int64')/ 10**9
df = df.sort_values(['person_id', 'measurement_datetime']).reset_index(drop=True)
data = df.to_numpy()
# this was an attempt to train one model across all artefact types; one-hot encoding is used later anyway
data = np.where(data=='HF', -0.75, data)
data = np.where(data=='map', -0.5, data)
data = np.where(data=='dia', -0.25, data)
data = np.where(data=='sys', 0., data)
data = np.where(data=='SpO2', 0.25, data)
data = np.where(data=='CO2', 0.5, data)
data = np.where(data=='Ta', 0.75, data)
return data.astype(float)
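The chained np.where calls above hard-code a string-to-number encoding of the Vitalsign column; an equivalent formulation (a sketch, not what the notebook uses) would map the categories through a dict before converting to numpy:
vitalsign_codes = {'HF': -0.75, 'map': -0.5, 'dia': -0.25, 'sys': 0.0, 'SpO2': 0.25, 'CO2': 0.5, 'Ta': 0.75}
# df['Vitalsign'] = df['Vitalsign'].map(vitalsign_codes)  # same effect as the chained np.where calls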
def AdmissionID_combine_rows(data, start=0, end=100, window_size=2, b_make_sequence=False, window_future_steps = 0, AdmissionIDs = []):
"""
Combines rows of data based on specified AdmissionIDs into a single array with a specified window size.
Args:
data (array-like): The input data to be combined.
start (int, optional): The starting index for combining rows. Defaults to 0.
end (int, optional): The ending index (exclusive) for combining rows. Defaults to 100.
window_size (int, optional): The size of the sliding window. Defaults to 2.
b_make_sequence (bool, optional): If True, the function returns a sequence of combined rows.
If False, it returns a 2D array with combined rows.
Defaults to False.
window_future_steps (int, optional): The number of future steps to include in each combined row.
Defaults to 0.
AdmissionIDs (list, optional): A list of AdmissionIDs indicating the sections of data to combine.
Defaults to an empty list.
Returns:
array-like: The combined rows of data.
"""
if(len(AdmissionIDs)>0):
main_data = combine_rows(data, AdmissionIDs[0], AdmissionIDs[1], window_size, b_make_sequence, window_future_steps)
# note: assumes AdmissionIDs contains at least three boundary indices (i from the loop is reused below)
for i in range(1, len(AdmissionIDs)-1):
main_data = np.append(main_data, combine_rows(data, AdmissionIDs[i], AdmissionIDs[i+1], window_size, b_make_sequence, window_future_steps), axis=0)
main_data = np.append(main_data, combine_rows(data, AdmissionIDs[i+1], end, window_size, b_make_sequence, window_future_steps), axis=0)
return main_data
else:
# no admission boundaries given: fall back to combining over the full range
return combine_rows(data, start, end, window_size, b_make_sequence, window_future_steps)
def process_data(data_np, full_data, number_inputs=40, number_of_static_inputs = 0, num_prev_index = 16, main_label_index = -6, history_length=10, b_remove_artefacts=True, b_make_sequence=False, batch_size=1024, b_generate_test_data=False, window_future_steps = 0, b_one_hot=True):
"""
Processes the data by applying various transformations and returns the preprocessed data.
Args:
data_np (array-like): The input data to be processed.
full_data (array-like): The full dataset for computing statistics.
number_inputs (int, optional): The number of input features. Defaults to 40.
number_of_static_inputs (int, optional): The number of static input features. Defaults to 0.
num_prev_index (int, optional): The number of previous indices to consider. Defaults to 16.
main_label_index (int, optional): The index of the main label. Defaults to -6.
history_length (int, optional): The history length for creating sequential data. Defaults to 10.
b_remove_artefacts (bool, optional): If True, removes artefacts from the data. Defaults to True.
b_make_sequence (bool, optional): If True, returns data as a sequence. If False, returns 2D array data.
Defaults to False.
batch_size (int, optional): The batch size for splitting the data into batches. Defaults to 1024.
b_generate_test_data (bool, optional): If True, generates test data. Defaults to False.
window_future_steps (int, optional): The number of future steps to include in each combined row.
Defaults to 0.
b_one_hot (bool, optional): If True, applies one-hot encoding to the labels. Defaults to True.
Returns:
tuple or array-like: The preprocessed data, AdmissionDataIndices, x_train, and y_train if b_generate_test_data
is True. Otherwise, x_train and y_train.
"""
if(history_length<1):
history_length = 1
data = data_np
chunksize = data.shape[0]
AdmissionDataIndices = []
admission_id = 1
# replace NaNs with 0 and remember their positions in data_nan
data_nan = np.zeros(data.shape)
for i in range(len(data)):
for j in range(len(data[0])):
if(np.isnan(data[i,j])):
data[i,j] = 0
data_nan[i,j] = 1
num_of_labels = data.shape[1]-number_inputs
max_values = []
min_values = []
mean_values = []
std_values = []
for i in range(len(data[0])):
max_values.append(np.max(full_data[:,i]))
min_values.append(np.min(full_data[:,i]))
mean_values.append(np.mean(full_data[:,i]))
std_values.append(np.std(full_data[:,i]))
if(b_remove_artefacts):
for j in range(len(full_data)):
if((full_data[j,i]>(mean_values[i]+7*std_values[i])) or (full_data[j,i]<(mean_values[i]-7*std_values[i]))):
print('Artefact')
full_data[j,i] = 0
if(j<data.shape[0] and i<data.shape[1]):
data[j,i] = 0
data_nan[j,i] = 1
if(b_remove_artefacts):
# recompute the statistics after zeroing artefacts; update in place so the list indices stay aligned with the columns
max_values[i] = np.max(full_data[:,i])
min_values[i] = np.min(full_data[:,i])
mean_values[i] = np.mean(full_data[:,i])
std_values[i] = np.std(full_data[:,i])
abs_max_values = []
mean_values = []
for i in range(0,len(data[0])):
abs_max_values.append(max(np.max(data[:,i]), abs(np.min(data[:,i]))))
mean_values.append(np.mean(data[:,i]))
if(i<data.shape[1]-num_of_labels-number_of_static_inputs):
if(abs_max_values[i]==0):
data[:,i] = (data[:,i])
else:
data[:,i] = (data[:,i]-mean_values[i])/(abs_max_values[i]+1E-7)
for i in range(len(data)):
for j in range(len(data[0])):
if(data_nan[i,j] == 1):
data[i,j] = 0
dif_data = np.diff(data[:,1:number_inputs-number_of_static_inputs], axis=0)
dif_data = np.append(dif_data, np.zeros((1,dif_data.shape[1])), axis=0)
acc_data = np.diff(dif_data, axis=0)
acc_data = np.append(acc_data, np.zeros((1,acc_data.shape[1])), axis=0)
for i, ele in enumerate(data[:,0]):
if(ele != admission_id):
admission_id = ele
AdmissionDataIndices.append(i)
dif_data[i,:] = np.zeros_like(dif_data[i,:])
acc_data[i,:] = np.zeros_like(acc_data[i,:])
j=0
prev_index = num_prev_index
label_index = main_label_index
current_id = AdmissionDataIndices[0]
hypo_save = np.copy(data[:,label_index])
hypo_count = 0
# when a look-ahead window is used, relabel event samples as 2 and mark the num_prev_index samples preceding an event as 1
if(num_prev_index>1):
for i in range(len(data)-prev_index):
if(i> AdmissionDataIndices[j]):
if(j<len(AdmissionDataIndices)-1):
j+=1
if((i+prev_index)<AdmissionDataIndices[j]):
if(hypo_save[i]==1):
data[i,label_index ] = 2
hypo_count +=1
for x in range(i-prev_index,i):
if(hypo_save[x]!=1 and x>AdmissionDataIndices[j-1]):
data[x,label_index ] = 1
if(history_length>1):
if(b_make_sequence):
data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))
else:
data_hist = np.zeros((data.shape[0], (data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length+ num_of_labels))
data_full = np.zeros((data.shape[0], data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]))
else:
#if(b_make_sequence):
# data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))
#else:
data_full = np.zeros((data.shape[0], data.shape[1]+dif_data.shape[1]+acc_data.shape[1]))
data_full[:,0:number_inputs] = data[:,0:number_inputs]
data_full[:,number_inputs:number_inputs+dif_data.shape[1]] = dif_data
data_full[:,number_inputs+dif_data.shape[1]:number_inputs+dif_data.shape[1]+acc_data.shape[1]] = acc_data
if(history_length>1):
if(b_make_sequence):
data_full[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]) : data_full.shape[1]] = data[:,number_inputs:data.shape[1]]
data_hist = AdmissionID_combine_rows(data_full, 0, data_full.shape[0], history_length, b_make_sequence, window_future_steps, AdmissionDataIndices)
else:
data_hist[:,0:(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length] = AdmissionID_combine_rows(data_full, 0, data_full.shape[0], history_length, b_make_sequence, window_future_steps, AdmissionDataIndices)
data_hist[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1])*history_length : data_hist.shape[1]] = data[:,number_inputs:data.shape[1]]
data = data_hist
del data_hist
else:
data_full[:,(data.shape[1]-num_of_labels+dif_data.shape[1]+acc_data.shape[1]) : data_full.shape[1]] = data[:,number_inputs:data.shape[1]]
data = data_full
del data_full
del dif_data
del acc_data
data[np.isnan(data)] = 0
if(b_generate_test_data):
if(b_make_sequence):
x_train = data[:,:, 1:-num_of_labels]
y_train = data[:,-1, main_label_index]
y_train = tf.cast(y_train , tf.int32)
if(num_prev_index>1):
if(b_one_hot):
y_train= tf.one_hot(y_train,3,)
else:
if(b_one_hot):
y_train= tf.one_hot(y_train,2,)
if(batch_size>0):
x_train = np.array(np.array_split(x_train, int(chunksize/batch_size)))
y_train = np.array(np.array_split(y_train, int(chunksize/batch_size)))
else:
x_train = data[:, 1:-num_of_labels]
y_train = data[:, label_index]
y_train = y_train.reshape(-1,1)
return data, AdmissionDataIndices, x_train, y_train
if(b_make_sequence):
x_train = data[:,:, 1:-num_of_labels]
y_train = data[:,-1, main_label_index]
y_train = tf.cast(y_train , tf.int32)
if(num_prev_index>1):
if(b_one_hot):
y_train= tf.one_hot(y_train,3,)
else:
if(b_one_hot):
y_train= tf.one_hot(y_train,2,)
if(batch_size>0):
x_train = np.array(np.array_split(x_train, int(chunksize/batch_size)))
y_train = np.array(np.array_split(y_train, int(chunksize/batch_size)))
return x_train, y_train
else:
x_train = data[:, 1:-num_of_labels]
y_train = data[:, label_index]
return x_train, y_train.reshape(-1,1)
def trim_to_batch_size(array, batch_size):
# Calculate the number of rows to keep
length_to_keep = len(array) // batch_size * batch_size
# Trim array
array = array[:length_to_keep]
return array
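For example, with a batch size of 4 an array of 10 rows is trimmed to the 8 rows that form two complete batches:
print(trim_to_batch_size(np.arange(10), 4).shape)  # (8,)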
from sklearn.model_selection import train_test_split
def create_processed_sets(data, number_inputs=40, number_of_static_inputs = 0, num_prev_index = 16, main_label_index = -6, history_length=10, b_remove_artefacts=True, b_make_sequence=False, batch_size=1024, b_generate_test_data=False, window_future_steps = 0, b_one_hot=True):
"""
Creates processed sets of training and testing data for a given dataset.
Args:
data (array-like): The input dataset.
number_inputs (int, optional): The number of input features. Defaults to 40.
number_of_static_inputs (int, optional): The number of static input features. Defaults to 0.
num_prev_index (int, optional): The number of previous indices to consider. Defaults to 16.
main_label_index (int, optional): The index of the main label. Defaults to -6.
history_length (int, optional): The history length for creating sequential data. Defaults to 10.
b_remove_artefacts (bool, optional): If True, removes artefacts from the data. Defaults to True.
b_make_sequence (bool, optional): If True, returns data as a sequence. If False, returns 2D array data.
Defaults to False.
batch_size (int, optional): The batch size for splitting the data into batches. Defaults to 1024.
b_generate_test_data (bool, optional): If True, generates test data. Defaults to False.
window_future_steps (int, optional): The number of future steps to include in each combined row.
Defaults to 0.
b_one_hot (bool, optional): If True, applies one-hot encoding to the labels. Defaults to True.
Returns:
tuple: The processed sets of training and testing data (x_train, y_train, x_test, y_test).
"""
# Get unique IDs, assuming IDs are in the first column
unique_ids = np.unique(data[:, 1])
# Shuffle these IDs and perform the split
train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=42) # Adjust test_size as needed
# Create the train and test arrays
train_set = data[np.isin(data[:, 1], train_ids)]
test_set = data[np.isin(data[:, 1], test_ids)]
train_set = trim_to_batch_size(train_set, batch_size)
test_set = trim_to_batch_size(test_set, batch_size)
# note: 'station' and 'sign' are globals defined further down in the notebook
np.savetxt('value_vector_'+station+'_'+sign+'.txt', np.roll(test_set[:,1], -history_length+1), fmt='%d')
x_train, y_train = process_data(train_set, data, number_inputs=number_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = num_prev_index, main_label_index = main_label_index, history_length=history_length, b_make_sequence=b_make_sequence, batch_size=batch_size, b_generate_test_data=b_generate_test_data, window_future_steps=window_future_steps, b_one_hot=b_one_hot, b_remove_artefacts=b_remove_artefacts)
x_test, y_test = process_data(test_set, data, number_inputs=number_inputs, num_prev_index = num_prev_index, number_of_static_inputs=number_of_static_inputs, main_label_index = main_label_index, history_length=history_length, b_make_sequence=b_make_sequence, batch_size=batch_size, b_generate_test_data=b_generate_test_data, window_future_steps=window_future_steps, b_one_hot=b_one_hot, b_remove_artefacts=b_remove_artefacts)
return x_train, y_train, x_test, y_test
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Bidirectional, LSTM
import numpy as np
from tensorflow.keras import backend as K
def define_model(num_neurons, history_length, input_size, number_of_outputs):
"""
Defines an LSTM model for sequence prediction.
Args:
num_neurons (int): The number of neurons in the LSTM layer.
history_length (int): The history length of the input sequences.
input_size (int): The number of features in each input sequence.
number_of_outputs (int): The number of output classes.
Returns:
Model: The defined LSTM model.
"""
lstm_layer = keras.layers.LSTM(num_neurons, input_shape=(history_length, input_size))
model = keras.models.Sequential(
[
lstm_layer,
keras.layers.BatchNormalization(),
keras.layers.Dropout(0.05),
keras.layers.Dense(number_of_outputs, activation= 'sigmoid'),
]
)
model.summary()
return model
modelname = "model_2048_16_8_True_Ta_54"
x = modelname.split("_")
batch_size = int(x[1])
history_length = int(x[2])
NumberOfNeurons = int(x[3])
boolean = (x[4] == 'True')  # bool('False') would also be truthy, so compare the string explicitly
sign = x[5]
number_of_static_inputs=0
stations = ['ALL', 'OP', 'ICU']
station = stations[2]
if(boolean):
window_future_steps = history_length
else:
window_future_steps = 0
number_of_inputs = 3
number_of_outputs = 1
df_sign = df.loc[df['Vitalsign'] == sign]
data_sign = create_dataset(df_sign)
x_train, y_train, X_test, y_test = create_processed_sets(data_sign, number_inputs=number_of_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = 0, main_label_index = -1, history_length=history_length, b_make_sequence=True, batch_size=batch_size, b_generate_test_data=False, window_future_steps=window_future_steps, b_one_hot=False, b_remove_artefacts=False)
model = define_model(NumberOfNeurons, history_length + window_future_steps-1, x_train.shape[3], number_of_outputs)
optim = tf.keras.optimizers.legacy.Adam()
model.compile(
optimizer=optim ,
loss=tf.keras.losses.BinaryCrossentropy(),
metrics=["accuracy", tf.keras.metrics.TruePositives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])
model.load_weights(modelname)
Model: "sequential_7" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm_7 (LSTM) (None, 8) 480 batch_normalization_7 (Batc (None, 8) 32 hNormalization) dropout_7 (Dropout) (None, 8) 0 dense_7 (Dense) (None, 1) 9 ================================================================= Total params: 521 Trainable params: 505 Non-trainable params: 16 _________________________________________________________________
<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x14b635fb5690>
# the training loop below is kept for reference but was disabled by wrapping it in a string literal
'''
ls_sign = ['Ta', 'CO2']
stations = ['ALL', 'OP', 'ICU']
station = stations[0]
number_of_outputs = 1
number_of_inputs = 3 #4 for multiple features
number_of_static_inputs = 0 #1 for multiple features
b_LSTM = True
hist_lengths = [8, 16]
batch_sizes = [2048]
truefalse = [True]
number_of_neurons = [8, 16]
loss_history = []
for sign in ls_sign:
for boolean in truefalse:
for batch_size in batch_sizes:
for hist in hist_lengths:
for neurons in number_of_neurons:
best_loss = 10000000000
best_acc = 0
best_sens = 0
best_spez = 0
best_auc = 0
additional_steps = 3
batch_size = batch_size
history_length = hist
if(boolean):
window_future_steps = history_length
else:
window_future_steps = 0
df_sign = df.loc[df['Vitalsign'] == sign]
data_sign = create_dataset(df_sign)
x_train, y_train, x_test, y_test = create_processed_sets(data_sign, number_inputs=number_of_inputs, number_of_static_inputs=number_of_static_inputs, num_prev_index = 0, main_label_index = -1, history_length=history_length, b_make_sequence=True, batch_size=batch_size, b_generate_test_data=False, window_future_steps=window_future_steps, b_one_hot=False, b_remove_artefacts=False)
model = define_model(neurons, history_length + window_future_steps-1, x_train.shape[3], number_of_outputs)
optim = tf.keras.optimizers.Adam(learning_rate=0.00005)
model.compile(
optimizer=optim ,
loss=tf.keras.losses.BinaryCrossentropy(),
metrics=["accuracy", tf.keras.metrics.TruePositives(), tf.keras.metrics.TrueNegatives(), tf.keras.metrics.FalsePositives(), tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])
for episode in range(100):
name = "model_"+str(batch_size)+"_"+str(history_length)+"_"+str(neurons)+"_"+str(boolean)+"_"+sign+"_"+str(episode)
average_val_loss = 0
average_val_acc = 0
average_spez = 0
average_sens = 0
average_auc = 0
no_spez_counter = 0
no_sens_counter = 0
no_auc_counter = 0
num_of_steps = x_train.shape[0]
for i in range(x_train.shape[0]):
#model.fit(x_train[i,:,:,:], y_train[i,:,:])
model.fit(x_train[i,:,:,:], y_train[i,:])
print('\n')
print('\n')
print('Validation: ')
print('--------------------------------------------------')
print('\n')
num_of_steps = x_test.shape[0]
for i in range(num_of_steps):
#loss, acc, spez, sens, auc = model.evaluate(x_test[i,:,:,:], y_test[i,:,:])
loss, acc, tp, tn, fp, fn, auc = model.evaluate(x_test[i,:,:,:], y_test[i,:])
average_val_loss += loss
average_val_acc += acc
if((tp + fn)== 0 or (tp + fp)== 0 or (tn + fp)== 0 or (tn + fn)== 0):
no_auc_counter += 1
else:
average_auc += auc
if((tp + fn)== 0):
no_sens_counter += 1
else:
average_sens += (tp / (tp + fn))
if((tn + fp)== 0):
no_spez_counter += 1
else:
average_spez += (tn / (tn + fp))
print('\n')
print('\n')
print('New best results: ')
print('--------------------------------------------------')
print('\n')
if(average_val_loss/num_of_steps>best_loss and average_val_acc/num_of_steps<best_acc):
additional_steps -= 1
if(additional_steps==0):
break
if(average_val_loss/num_of_steps<best_loss):
best_loss = average_val_loss/num_of_steps
print('New best loss: ', best_loss)
additional_steps = 10
if(average_val_acc/num_of_steps>best_acc):
best_acc = average_val_acc/num_of_steps
print('New best accuracy: ', best_acc)
additional_steps = 10
if(average_sens/(num_of_steps-no_sens_counter)>best_sens):
best_sens = average_sens/(num_of_steps-no_sens_counter)
print('New best Sensitivity: ', best_sens)
if(average_spez/(num_of_steps-no_spez_counter)>best_spez):
best_spez = average_spez/(num_of_steps-no_spez_counter)
print('New best Specificity: ', best_spez)
if(average_auc/(num_of_steps-no_auc_counter+0.000001)>best_auc):
best_auc = average_auc/(num_of_steps-no_auc_counter+0.000001)
print('New best AUC: ', best_auc)
additional_steps = 10
loss_history.append([average_val_loss/num_of_steps, average_val_acc/num_of_steps, average_spez/(num_of_steps-no_spez_counter), average_sens/(num_of_steps-no_sens_counter), average_auc/(num_of_steps-no_auc_counter+0.000001), name])
with open("loss_history", 'w') as file:
for item in loss_history:
for word in item:
file.write(str(word) + '\t')
file.write('\n')
model.save_weights(name)
print('\n')
print('Number of tries left:', additional_steps)
print('--------------------------------------------------')
print('\n')
'''
AdmissionDataIndices = []
admission_id = 1
for i, ele in enumerate(data_sign[:,0]):
if(ele != admission_id):
admission_id = ele
AdmissionDataIndices.append(i)
try:
X_test = x_test
except:
pass
try:
X_train = x_train
except:
pass
neg_neg = 0
neg_pos = 0
pos_neg = 0
pos_pos = 0
counter = 0
X_train_len = X_train.shape[0]*X_train.shape[1]
for batch_index in range(X_test.shape[0]):
label_curve = []
predicted_curve = []
main_curve = []
prob_pos = []
preds = model.predict(X_test[batch_index,:,:,:])
for i in range(X_test.shape[1]):
patient_id = df_sign.iloc[i+X_train_len+batch_index*batch_size,1]
if(get_group(patient_id, icu)==station or station == 'ALL'):
pred = preds[i]
label = y_test[batch_index,i]
main_curve.append(X_test[batch_index, i, -1 ,0])
label_curve.append(label)
if(label>0):
counter+=1
if(label==0):
if(pred<0.5):
neg_neg+=1
else:
neg_pos+=1
else:
if(pred<0.5):
pos_neg+=1
else:
pos_pos+=1
prob_pos.append(preds.flatten())
predicted_curve.append(pred)
sum_neg = neg_neg + neg_pos
sum_pos = pos_neg + pos_pos
print(neg_neg, neg_pos, pos_neg, pos_pos)  # confusion-matrix counts: TN, FP, FN, TP
print(neg_neg/sum_neg, pos_pos/sum_pos)  # specificity and sensitivity
print((neg_neg + pos_pos) / (sum_neg + sum_pos))  # overall accuracy
1926 0 7 11
1.0 0.6111111111111112
0.9963991769547325
X_test.shape
(1, 2048, 31, 6)
y_test = y_test.flatten()
y_test.shape
(2048,)
prob_pos = []
for batch_index in range(X_test.shape[0]):
prob_pos.append(model.predict(X_test[batch_index,:,:,:]))
prob_pos_train = []
for batch_index in range(x_train.shape[0]):
prob_pos_train.append(model.predict(x_train[batch_index,:,:,:]))
prob_pos = np.array(prob_pos).flatten()
prob_pos_train = np.array(prob_pos_train).flatten()
real_values = np.loadtxt('value_vector_'+station+'_'+sign+'.txt', dtype=int)
import matplotlib.pyplot as plt
import numpy as np
# Create a color array based on TP, FP, TN, FN
colors = []
for i in range(len(y_test)):
label = y_test[i]
pred = prob_pos[i]
if label > 0 and pred >= 0.5: # TP: Actual positive and predicted as positive
colors.append('green')
elif label == 0 and pred < 0.5: # TN: Actual negative and predicted as negative
colors.append('blue')
elif label == 0 and pred >= 0.5: # FP: Actual negative but predicted as positive
colors.append('red')
else: # FN: Actual positive but predicted as negative
colors.append('orange')
# Convert colors to a numpy array
colors = np.array(colors)
# Create a scatter plot of main_curve with color-coding based on prediction correctness
plt.scatter(range(len(real_values)), real_values, c=colors, s=15) # s parameter controls the size of dots
plt.plot(range(len(real_values)), real_values, color='black', linewidth=0.5) # plot the line
# Title and labels
plt.title('Example Signal with Prediction Outcomes')
plt.xlabel('Time Step (index)')
plt.ylabel('Example Signal')
# Setting the x limit
plt.xlim([1200, 1600])
# Legend
plt.scatter([], [], color='green', label='True Positive', s=15)
plt.scatter([], [], color='blue', label='True Negative', s=15)
plt.scatter([], [], color='red', label='False Positive', s=15)
plt.scatter([], [], color='orange', label='False Negative', s=15)
plt.legend()
plt.savefig('Example_plot_'+station+'_'+sign+'.png')
plt.show()
from sklearn.linear_model import LogisticRegression
raw_probabilities = prob_pos_train.reshape(-1, 1)
# Fit the Logistic Regression to calibrate the probabilities
lr = LogisticRegression().fit(raw_probabilities, y_train.flatten())
calibrated_probabilities = lr.predict_proba(prob_pos.reshape(-1, 1)) # Calibrate probabilities
prob_pos = calibrated_probabilities
prob_pos = prob_pos[:,1]
print(prob_pos)
[0.00331583 0.00331507 0.00331439 ... 0.00332162 0.00332071 0.00331997]
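The logistic regression above is a Platt-style recalibration of the raw LSTM outputs: the calibrated probability is a sigmoid of an affine transform of the raw score,

$$ p_{\mathrm{cal}} = \frac{1}{1 + e^{-(a\,p_{\mathrm{raw}} + b)}}, $$

with a and b fitted on the training-set predictions; predict_proba returns the pair (1 - p_cal, p_cal), which is why only column 1 is kept above.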
np.savetxt('probability_vector_'+station+'_'+sign+'.txt', prob_pos, fmt='%f')
np.savetxt('label_vector_'+station+'_'+sign+'.txt', y_test, fmt='%d')
y_pred = (prob_pos > 0.5).astype(int)
import seaborn as sns
sns.set_style("white")
from sklearn.calibration import calibration_curve
fraction_of_positives, mean_predicted_value = calibration_curve(y_test, prob_pos, n_bins=10)
from sklearn.metrics import roc_curve, roc_auc_score
# Calculate the ROC curve points
fpr, tpr, thresholds = roc_curve(y_test, prob_pos)
# Calculate the AUROC score
auroc = roc_auc_score(y_test, prob_pos)
# Figure 1 for Calibration curve and Histogram
fig1 = plt.figure(figsize=(10, 8)) # Set the overall figure size
gs1 = plt.GridSpec(2, 1, height_ratios=[1, 1]) # Set height ratios to 1:1 for equal vertical size
# Calibration curve
ax1 = plt.subplot(gs1[0])
ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
ax1.plot(mean_predicted_value, fraction_of_positives, "s-", label="%s" % 'LSTM-Classifier', color='blue')
ax1.set_xlim([0, 1]) # Set x range to 0 to 1
ax1.set_ylim([0, 1]) # Set y range to 0 to 1
ax1.set_ylabel("Fraction of positives")
ax1.set_title('Calibration plot of LSTM classifier for '+sign+ ' ('+station+')')
ax1.legend(loc="upper left")
ax1.xaxis.tick_top() # Move x-axis labels to top
ax1.xaxis.set_label_position('top') # Move x-axis label to top
# Position legend of plot 1 on x-axis
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles, labels, ncol=2)
# Histogram as an "inset" plot
ax2 = plt.subplot(gs1[1], sharex=ax1) # Sharing x-axis with first plot
hist, bins = np.histogram(prob_pos, bins=10, range=(0, 1))
width = (bins[1] - bins[0]) * 0.8
centers = (bins[:-1] + bins[1:]) / 2
ax2.bar(centers, hist, align='center', width=width, alpha=0.5, color='green', label='Histogram')
ax2.set_xlabel("Predicted probability") # Add x-axis label
ax2.set_ylabel("Number of cases")
ax2.yaxis.set_major_locator(plt.MaxNLocator(5))
plt.subplots_adjust(hspace=0)
plt.savefig('Calibration_Histogram_'+station+'_'+sign+'.png')
plt.show()
import numpy as np
from sklearn.metrics import roc_auc_score, recall_score, precision_score, accuracy_score
from sklearn.utils import resample
# Define the number of bootstrap samples
n_bootstrap = 2000
# Create empty lists to store bootstrapped metrics
sensitivity_values = []
specificity_values = []
ppv_values = []
npv_values = []
auroc_values = []
accuracy_values = []
# Perform bootstrap resampling
for _ in range(n_bootstrap):
# Sample with replacement from the predictions
indices = resample(range(len(y_pred)), replace=True)
bootstrap_y_test = y_test[indices]
bootstrap_y_pred = y_pred[indices]
bootstrap_prob_pos = prob_pos[indices]
# Calculate metrics for the bootstrap sample
sensitivity = recall_score(bootstrap_y_test, bootstrap_y_pred)
specificity = recall_score(bootstrap_y_test, bootstrap_y_pred, pos_label=0)
ppv = precision_score(bootstrap_y_test, bootstrap_y_pred)
npv = precision_score(bootstrap_y_test, bootstrap_y_pred, pos_label=0)
auroc = roc_auc_score(bootstrap_y_test, bootstrap_prob_pos)
accuracy = accuracy_score(bootstrap_y_test, bootstrap_y_pred)
# Append the metric values to the lists
sensitivity_values.append(sensitivity)
specificity_values.append(specificity)
ppv_values.append(ppv)
npv_values.append(npv)
auroc_values.append(auroc)
accuracy_values.append(accuracy)
# Calculate confidence intervals
confidence_interval = 0.95
alpha = (1 - confidence_interval) / 2
lower_percentile = alpha * 100
upper_percentile = (1 - alpha) * 100
# Calculate confidence intervals for each metric
sensitivity_ci = np.percentile(sensitivity_values, [lower_percentile, upper_percentile])
specificity_ci = np.percentile(specificity_values, [lower_percentile, upper_percentile])
ppv_ci = np.percentile(ppv_values, [lower_percentile, upper_percentile])
npv_ci = np.percentile(npv_values, [lower_percentile, upper_percentile])
auroc_ci = np.percentile(auroc_values, [lower_percentile, upper_percentile])
accuracy_ci = np.percentile(accuracy_values, [lower_percentile, upper_percentile])
with open("metrics_"+station+"_"+sign+".txt", "w") as file:
file.write("Accuracy: {:.3f} ({:.3f}-{:.3f})\n".format(np.mean(accuracy_values), accuracy_ci[0], accuracy_ci[1]))
file.write("Sensitivity: {:.3f} ({:.3f}-{:.3f})\n".format(sensitivity, sensitivity_ci[0], sensitivity_ci[1]))
file.write("Specificity: {:.3f} ({:.3f}-{:.3f})\n".format(specificity, specificity_ci[0], specificity_ci[1]))
file.write("Positive Predictive Value (PPV): {:.3f} ({:.3f}-{:.3f})\n".format(ppv, ppv_ci[0], ppv_ci[1]))
file.write("Negative Predictive Value (NPV): {:.3f} ({:.3f}-{:.3f})\n".format(npv, npv_ci[0], npv_ci[1]))
file.write("Area Under the ROC Curve (AUROC): {:.3f} ({:.3f}-{:.3f})".format(auroc, auroc_ci[0], auroc_ci[1]))
from sklearn.metrics import roc_curve, auc
from numpy import interp
n_bootstraps = 2000
rng_seed = 42 # control reproducibility
bootstrapped_tprs = []
bootstrapped_auc = []
base_fpr = np.linspace(0, 1, 101) # Fix the FPR values for interpolation
rng = np.random.RandomState(rng_seed)
for i in range(n_bootstraps):
# bootstrap by sampling with replacement on the prediction indices
indices = rng.randint(0, len(prob_pos), len(prob_pos))
if len(np.unique(y_test[indices])) < 2:
# We need at least one positive and one negative sample for ROC AUC
# to be defined: reject the sample
continue
fpr_boot, tpr_boot, _ = roc_curve(y_test[indices], prob_pos[indices])
bootstrapped_auc.append(auc(fpr_boot, tpr_boot))
# Interpolate TPR for the fixed FPR values
tpr_boot_interp = interp(base_fpr, fpr_boot, tpr_boot)
bootstrapped_tprs.append(tpr_boot_interp)
# Convert list to a 2D array
bootstrapped_tprs = np.array(bootstrapped_tprs)
# Compute the lower and upper bound of the ROC curve
tprs_lower = np.percentile(bootstrapped_tprs, 2.5, axis=0)
tprs_upper = np.percentile(bootstrapped_tprs, 97.5, axis=0)
# Compute the lower and upper bound of the ROC AUC
auc_lower = np.percentile(bootstrapped_auc, 2.5)
auc_upper = np.percentile(bootstrapped_auc, 97.5)
print("Confidence interval for the ROC curve: [{:0.3f} - {:0.3}]".format(auc_lower, auc_upper))
Confidence interval for the ROC curve: [0.986 - 0.999]
fig2 = plt.figure(figsize=(10, 6))
specificity = 1 - base_fpr # Calculate Specificity (1 - False Positive Rate)
ax3 = plt.subplot(1, 1, 1)
ax3.plot([1, 0], [0, 1], 'k--') # Invert x-axis
ax3.plot(specificity, np.median(bootstrapped_tprs, axis=0), label=r'ROC (AUC = %0.2f $\pm$ %0.2f)' % (auroc, (auc_upper-auc_lower)/2), color='blue')
ax3.fill_between(specificity, tprs_lower, tprs_upper, color='grey', alpha=0.2)
ax3.set_xlabel('Specificity (True Negative Rate)')
ax3.set_ylabel('Sensitivity (True Positive Rate)')
ax3.set_title('Receiver Operating Characteristic for '+sign+ ' ('+station+')')
ax3.legend(loc='lower right')
ax3.set_xlim([1, 0]) # Invert x-axis limits
ax3.set_ylim([0, 1])
ax3.set_xticks([1,0.8,0.6,0.4,0.2,0]) # Set x-axis tick locations
ax3.set_xticklabels([1,0.8,0.6,0.4,0.2,0]) # Set x-axis tick labels
plt.tight_layout()
plt.savefig('ROC_Curve_'+station+'_'+sign+'.png')
plt.show()