Commit fa8cbf23 authored by slavenc

added project 3 files

parent 0c9ca4d0
files/
output/
__pycache__
@@ -64,7 +64,6 @@ def augment():
    # data augmentation
    # TODO : augment
    X_train_scaled, X_test_scaled = scale(X_train_mod, X_test_mod)
    X_train_ones = X_train_mod + np.ones_like(X_train_mod)  # shift every value by +1
    X_test_ones = X_test_mod + np.ones_like(X_test_mod)
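    # a common alternative (sketch, not in this commit): additive Gaussian jitter,
    # e.g. X_train_mod + np.random.normal(0, 0.01, X_train_mod.shape)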
from sklearn.impute import SimpleImputer
import numpy as np
def impute(X, strategy='mean'):
    # fill nans column-wise using the requested strategy ('mean', 'median', ...)
    imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
    imputer.fit(X)
    X_mod = imputer.transform(X)
    return X_mod
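# example usage (sketch): fill nans with column medians instead of means
#   X_filled = impute(X, strategy='median')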
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 25 19:22:20 2019
@author: made_
"""
import numpy as np
import pandas as pd
import keras
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dropout, LSTM, Dense
# libraries when working in Colaboratory
#from google.colab import files
# get data
X_train = pd.read_csv('files/X_train_aug.csv').values[:, 1:]  # drop the leading index column
y_train = pd.read_csv('files/y_train.csv').values[:, 1]
X_test = pd.read_csv('files/X_test_aug.csv').values[:, 1:]
z,s = np.shape(X_train)
scaler = StandardScaler()
scaler.fit(X_train)
X_transformed = scaler.transform(X_train)
Xtest_transformed = scaler.transform(X_test)
# define the model architecture
lahead = s
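# treat each sample as a univariate sequence: s timesteps with one feature each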
ann = Sequential()
ann.add(LSTM(1024, input_shape = (lahead,1), activation = 'relu',
return_sequences = True))
ann.add(Dropout(0.2))
ann.add(LSTM(512, activation = 'relu'))
ann.add(Dropout(0.25))
#ann.add(LSTM(512, activation = 'relu'))
#ann.add(Dropout(0.25))
#
#ann.add(LSTM(256, activation = 'relu'))
#ann.add(Dropout(0.25))
ann.add(Dense(128, activation = 'relu'))
ann.add(Dropout(0.25))
ann.add(Dense(4, activation='softmax'))
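# four output units, one per class; sparse_categorical_crossentropy below
# expects y as integer labels, so no one-hot encoding is needed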
opt = keras.optimizers.Adam(lr = 1e-3, decay = 1e-5)
ann.compile(loss = 'sparse_categorical_crossentropy',
optimizer = opt,
metrics = ['accuracy'])
# reshape to fulfill LSTM shape requirements
reshape_1 = lambda x: x.reshape((x.shape[0], x.shape[1], 1))  # (samples, timesteps, features=1)
reshape_2 = lambda y: y.reshape((y.shape[0], 1))              # (samples, 1)
X_reshaped = reshape_1(X_transformed)  # use the scaled features, consistent with the test set
Xtest_reshaped = reshape_1(Xtest_transformed)
y_reshaped = reshape_2(y_train)
# fit and predict (later add class weights and also optimize them)
ann.fit(X_reshaped, y_reshaped, epochs = 1000, batch_size = 500)
pred_ann = ann.predict_classes(Xtest_reshaped)
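# note: predict_classes exists in standalone Keras and tf.keras < 2.6; on newer
# versions use np.argmax(ann.predict(Xtest_reshaped), axis=-1) instead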
print(pred_ann)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 15:53:27 2019
@author: made_
"""
import numpy as np
import pandas as pd
from scale import scale
from outliers import outliers
from impute import impute
def modify():
    # import data
    X_train = pd.read_csv('files/X_train.csv').values[:, 1:]
    y_train = pd.read_csv('files/y_train.csv').values[:, 1]
    X_test = pd.read_csv('files/X_test.csv').values[:, 1:]
    z, s = np.shape(X_train)
    z_, s_ = np.shape(X_test)
    # check if balanced
    print('Size of X:', np.shape(X_train))
    print('Classes in y:', np.unique(y_train))
    print('Class Balance')
    for label in np.unique(y_train):
        print('  Data for Class %s:' % label, np.sum(y_train == label))
    # check for presence of nans, column by column
    isnan_train = np.zeros(s, dtype=int)
    isnan_test = np.zeros(s, dtype=int)
    for i in range(s):
        isnan_train[i] = np.sum(np.isnan(X_train[:, i]))
        isnan_test[i] = np.sum(np.isnan(X_test[:, i]))
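    # equivalent vectorized form: isnan_train = np.isnan(X_train).sum(axis=0)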
    # keep only the features with fewer than 300 nans in X_train
    X_train_mod = X_train[:, isnan_train < 300]
    X_test_mod = X_test[:, isnan_train < 300]  # apply the same column selection to the test set
    # use a simple procedure to replace the remaining nans
    X_train_mod = impute(X_train_mod)
    X_test_mod = impute(X_test_mod)
    # check for outliers - causes problems when writing y_pred to csv
    # train_inliers = outliers(X_train_mod)
    # test_inliers = outliers(X_test_mod)
    # X_train_mod = X_train_mod[train_inliers, :]
    # X_test_mod = X_test_mod[test_inliers, :]
    # y_train = y_train[train_inliers]
    # scale data after the previous procedure
    # X_train_mod, X_test_mod = scale(X_train_mod, X_test_mod)
    # save the modified data for quicker reuse
    pd.DataFrame(X_train_mod).to_csv('files/X_train_mod.csv')
    pd.DataFrame(X_test_mod).to_csv('files/X_test_mod.csv')
    # pd.DataFrame(y_train).to_csv('files/y_train_mod.csv')
\ No newline at end of file
import numpy as np
from sklearn.ensemble import IsolationForest
# remove outliers
def outliers(X, n_trees=100, outlier_proportion=0.15):
    # fit an IsolationForest and return the indices of the predicted inliers
    iso = IsolationForest(n_estimators=n_trees,
                          behaviour='new',
                          contamination=outlier_proportion)
    iso.fit(X)
    inlier_indices = np.nonzero(iso.predict(X) > 0)[0]
    return inlier_indices
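# example usage (sketch): keep only the predicted inliers
#   keep = outliers(X_train_mod)
#   X_train_mod, y_train = X_train_mod[keep, :], y_train[keep]
# note: the behaviour= argument only exists in scikit-learn < 0.24 and can be
# dropped on newer versions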
\ No newline at end of file
from sklearn.model_selection import train_test_split

def preprocess(X_train, y_train):
    # hold out the last 25% of the rows as a validation set; with shuffle=False
    # the split is already deterministic (random_state is ignored when not shuffling)
    return train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        shuffle=False,
        random_state=0)
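# example usage (sketch):
#   X_tr, X_val, y_tr, y_val = preprocess(X_train, y_train)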
import pandas as pd
from sklearn.utils import resample
ALGORITHM = "undersampling"  # { oversampling, undersampling, smote }

# TODO: make generic instead of hardcoded
def sampling(X_train, y_train, X_columns, y_columns):
    ## Sampling
    X_train = pd.DataFrame(data=X_train, columns=X_columns)
    y_train = pd.DataFrame(data=y_train, columns=y_columns)
    X = pd.concat([y_train, X_train], axis=1)
    # assumes the label column is named 'y'
    label_0 = X[X.y == 0]
    label_1 = X[X.y == 1]
    label_2 = X[X.y == 2]
    if ALGORITHM == "oversampling":
        # upsample the minority classes to the size of the majority class
        label_0_upsampled = resample(label_0, n_samples=len(label_1), random_state=0)
        label_2_upsampled = resample(label_2, n_samples=len(label_1), random_state=0)
        upsampled = pd.concat([label_0_upsampled, label_1, label_2_upsampled]).sample(frac=1, random_state=0)
        y_train = upsampled.values[:, 0]
        X_train = upsampled.values[:, 1:]
    elif ALGORITHM == "undersampling":
        # downsample the majority class to the size of class 0
        label_1_downsampled = resample(label_1, n_samples=len(label_0), random_state=0)
        downsampled = pd.concat([label_0, label_1_downsampled, label_2]).sample(frac=1, random_state=0)
        y_train = downsampled.values[:, 0]
        X_train = downsampled.values[:, 1:]
    return X_train, y_train
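# note: the "smote" option is not implemented above; a sketch using
# imbalanced-learn (assumed dependency) would be:
#   from imblearn.over_sampling import SMOTE
#   X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)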
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test):
    # fit the scaler on the training data only, then apply it to both sets
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled
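# example usage (sketch), mirroring the call commented out in modify():
#   X_train_mod, X_test_mod = scale(X_train_mod, X_test_mod)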