Commit 0c9ca4d0 authored by slavenc's avatar slavenc

added data augmentation

parent 2bbd5085
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 13:30:00 2019
@author: made_
"""
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from scale import scale
from outliers import outliers
from impute import impute
from perturb import perturb
def augment():
    """Build augmented train/test feature matrices and write them to files/.

    Reads files/X_train.csv, files/y_train.csv and files/X_test.csv,
    prints class-balance diagnostics, drops features with too many NaNs,
    imputes the remainder, then stacks several transformed copies of the
    data (scaled, shifted by one, Gaussian-noise perturbed) as extra
    feature columns and saves the result to files/X_train_aug.csv and
    files/X_test_aug.csv.

    Returns None; all output is written to disk.
    """
    # import data (first CSV column is the sample id, so drop it)
    X_train = pd.read_csv('files/X_train.csv').values[:, 1:]
    y_train = pd.read_csv('files/y_train.csv').values[:, 1]
    X_test = pd.read_csv('files/X_test.csv').values[:, 1:]

    n_samples, n_features = np.shape(X_train)

    # check if balanced
    print('Size of X:', np.shape(X_train))
    print('Classes in y:', np.unique(y_train))
    print('Class Balance')
    for label in range(len(np.unique(y_train))):
        print(' Data for Class %s: ' % label, len(y_train[np.where(y_train == label)]))

    # count NaNs per feature column of the training set
    # NOTE(review): assumes X_test has the same columns as X_train --
    # confirm the two CSVs share one schema.
    isnan_train = np.array([np.sum(np.isnan(X_train[:, i]))
                            for i in range(n_features)])

    # keep only features with fewer than 100 NaNs in X_train, and apply
    # the same column selection to the test set
    keep = isnan_train < 100
    X_train_mod = X_train[:, keep]
    X_test_mod = X_test[:, keep]

    # use a simple procedure to replace the remaining NaNs
    X_train_mod = impute(X_train_mod)
    X_test_mod = impute(X_test_mod)

    # outlier removal was disabled: it caused problems when writing
    # y_pred to csv (dropped rows desynchronize train/test indices)

    # data augmentation: stack transformed copies as extra features
    X_train_scaled, X_test_scaled = scale(X_train_mod, X_test_mod)
    X_train_ones = X_train_mod + np.ones_like(X_train_mod)
    X_test_ones = X_test_mod + np.ones_like(X_test_mod)
    # BUGFIX: perturb() adds noise to its argument in place, which would
    # corrupt X_train_mod/X_test_mod before the concatenation below, so
    # the "unperturbed" columns would be perturbed too -- perturb copies.
    X_train_pert = perturb(X_train_mod.copy())
    X_test_pert = perturb(X_test_mod.copy())

    X_train_aug = np.concatenate(
        (X_train_mod, X_train_scaled, X_train_ones, X_train_pert), axis=1)
    X_test_aug = np.concatenate(
        (X_test_mod, X_test_scaled, X_test_ones, X_test_pert), axis=1)

    # save the augmented data for quicker re-use
    pd.DataFrame(X_train_aug).to_csv('files/X_train_aug.csv')
    pd.DataFrame(X_test_aug).to_csv('files/X_test_aug.csv')
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 24 15:26:55 2019
@author: made_
"""
import numpy as np
# perturb data by adding a bit of random Gaussian noise for every feature
def perturb(X, mean=0, std=1):
    """Return a copy of X with i.i.d. Gaussian noise added to every entry.

    Parameters
    ----------
    X : array_like of shape (n_samples, n_features)
        Input data. It is NOT modified; the previous implementation
        mutated its argument in place (``X[:, i] += noise``), which
        corrupted callers' arrays and raised a casting error for
        integer-dtype input.
    mean, std : float, default 0 and 1
        Parameters of the normal distribution the noise is drawn from.

    Returns
    -------
    numpy.ndarray
        ``X + noise`` as a float array of the same shape.
    """
    X = np.asarray(X, dtype=float)
    # one vectorized draw replaces the per-column loop and keeps the
    # caller's array untouched
    noise = np.random.normal(mean, std, np.shape(X))
    return X + noise
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 18 17:10:53 2019
@author: made_
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.utils import compute_class_weight
from run import run
# get data
#X_train = pd.read_csv('files/X_train_aug.csv').values[:,1:]
#y_train = pd.read_csv('files/y_train_aug.csv').values[:,1 ]
#X_test = pd.read_csv('files/X_test_aug.csv').values[:,1:]
PROTOTYPING = True

# parameter search space for the manual grid search below
n_trees = np.array([40, 50, 60, 70, 80])
boot_opt = [False, True]
random_state = 1000
weights = [None, 'balanced', 'balanced_subsample']
param_grid = [{'n_estimators': n_trees,
               'bootstrap': boot_opt,
               'oob_score': boot_opt,
               'class_weight': [None, 'balanced', 'balanced_subsample']}]

# manual grid search over RandomForestClassifier settings; iterate the
# parameter values directly instead of indexing via range(len(...)).
# oob_score is deliberately tied to bootstrap: sklearn only allows
# out-of-bag scoring when bootstrapping is enabled.
for n_estimators in n_trees:
    for bootstrap in boot_opt:
        for class_weight in weights:
            print('Estimators =', n_estimators)
            print('Bootstrap =', bootstrap)
            print('Weights are ', class_weight)
            run(classifier=RandomForestClassifier(n_estimators=n_estimators,
                                                  bootstrap=bootstrap,
                                                  oob_score=bootstrap,
                                                  class_weight=class_weight,
                                                  random_state=random_state),
                proto=PROTOTYPING)
            print('')

## GridSearchCV alternative (kept for reference)
#clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring = 'accuracy')
#clf.fit(X_train, y_train)
#print(clf.best_params_)
####

# Do the final prediction with the best params from the search above
PROTOTYPING = False
run(classifier=RandomForestClassifier(n_estimators=100, bootstrap=True, oob_score=True,
                                      class_weight='balanced_subsample'),
    proto=PROTOTYPING)
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from modify import modify
from augment import augment
from scale import scale
from sampling import sampling
from preprocess import preprocess
# process data: creates modified train and testsets for faster import and use
# modify() # creates modified dataset
# augment() # creates augmented dataset
# for unbalanced data: OvO, OvR or None
def run(classifier, proto=False):
    """Train a one-vs-one wrapper around `classifier` and predict.

    Parameters
    ----------
    classifier : sklearn estimator
        Base classifier wrapped in a OneVsOneClassifier.
    proto : bool, default False
        If True, split the training data for validation and print the
        weighted F1 score; if False, train on everything and write the
        predictions to outputs/.

    Returns None; results are printed and/or written to disk.
    """
    ## Reading input (drop the index/id columns added on CSV export)
    X_train = pd.read_csv('files/X_train_aug.csv').drop('Unnamed: 0', axis=1)
    X_test = pd.read_csv('files/X_test_aug.csv').drop('Unnamed: 0', axis=1)
    y_train = pd.read_csv('files/y_train.csv').drop('id', axis=1)
    X_columns = X_train.columns.values
    y_columns = y_train.columns.values

    ## Splitting for validation
    if proto:
        X_train, X_test, y_train, y_test = preprocess(X_train, y_train)
        print("Finished splitting")

    # sampling adds one row more to X_train for some reason... also make generic
    # this part is probably not needed for rfcs as it does it itself sometimes
    X_train, y_train = sampling(X_train, y_train, X_columns, y_columns)
    print("Finished sampling")
    print(np.shape(X_train))
    print(np.shape(X_test))
    print(np.shape(y_train))

    ## Training
    # BUGFIX(naming): the old local was called OvRClassifier but actually
    # wraps a OneVsOneClassifier; renamed to match what it is.
    ovo_classifier = OneVsOneClassifier(classifier)
    ovo_classifier.fit(X_train, y_train)
    print("Finished training")

    ## Predicting
    y_predict = ovo_classifier.predict(X_test)
    if proto:
        print(f1_score(y_test, y_predict, average='weighted'))

    ## Writing output
    if not proto:
        output = pd.read_csv('files/sample.csv')
        # vectorized assignment replaces the per-row .iat loop; the slice
        # preserves the old loop's behavior of only filling output's rows
        output.iloc[:, 1] = y_predict[:output.shape[0]]
        output.to_csv(f"outputs/{ovo_classifier.__class__.__name__}.{classifier.__class__.__name__}.csv", index=False)
    print("Finished predicting")
from sklearn.svm import NuSVC
from run import run
# Final run: train a NuSVC on the full training set (no validation split)
PROTOTYPING = False
final_classifier = NuSVC(nu=0.5, gamma='scale')
run(classifier=final_classifier, proto=PROTOTYPING)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment