Commit 2bbd5085 authored by slavenc

add project 2 files

parent 683116b1
# Driver script: logistic-regression baseline.
from sklearn.linear_model import LogisticRegression

from run import run

PROTOTYPING = True

run(classifier=LogisticRegression(solver="lbfgs"), proto=PROTOTYPING)
# preprocessing.py
from sklearn.model_selection import train_test_split

def preprocess(X_train, y_train):
    # Deterministic 75/25 split for reproducible results; with shuffle=False
    # the split is fixed and random_state has no effect.
    return train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        shuffle=False,
        random_state=0)
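
# A stratified alternative (hedged sketch, not wired into run.py): with
# shuffle=True, stratify=y_train keeps the class ratios of y_train in both
# folds, which matters for imbalanced labels like this project's (see
# sampling.py). The function name is illustrative only.
def preprocess_stratified(X_train, y_train):
    return train_test_split(
        X_train,
        y_train,
        test_size=0.25,
        shuffle=True,        # random_state only takes effect when shuffling
        stratify=y_train,
        random_state=0)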
# Driver script: random-forest parameter search and final prediction.
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

from run import run

# Get data (only needed for the commented-out GridSearchCV below;
# run() loads the files itself).
X_train = pd.read_csv('files/X_train.csv').values[:, 1:]
y_train = pd.read_csv('files/y_train.csv').values[:, 1]
X_test = pd.read_csv('files/X_test.csv').values[:, 1:]

PROTOTYPING = True

# Do a parameter search.
n_trees = np.array([128, 129, 130, 131, 132, 133, 134])
boot_opt = [False, True]
random_state = 1000
weights = [None, 'balanced', 'balanced_subsample']
param_grid = [{'n_estimators': n_trees,
               'bootstrap': boot_opt,
               'oob_score': boot_opt,
               'class_weight': weights}]

# Run ExtraTreesClassifier and RandomForestClassifier separately
# (swap the class below to search the other one).
for n in n_trees:
    for boot in boot_opt:
        for weight in weights:
            print('Estimators =', n)
            print('Bootstrap =', boot)
            print('Weights are', weight)
            run(classifier=RandomForestClassifier(n_estimators=n,
                                                  bootstrap=boot,
                                                  oob_score=boot,  # OOB estimate only valid with bootstrapping
                                                  class_weight=weight,
                                                  random_state=random_state),
                proto=PROTOTYPING)
            print('')
## GridSearchCV alternative to the manual loop above (kept commented out):
#clf = GridSearchCV(RandomForestClassifier(random_state=random_state),
#                   param_grid, cv=5, scoring='accuracy',
#                   error_score=np.nan)  # bootstrap=False with oob_score=True is invalid
#clf.fit(X_train, y_train)
#print(clf.best_params_)
####
# Do the final prediction with the best parameters from the search above.
PROTOTYPING = False

run(classifier=RandomForestClassifier(n_estimators=134, bootstrap=True, oob_score=True,
                                      class_weight='balanced'), proto=PROTOTYPING)
# run.py
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

from scale import scale
from sampling import sampling
from preprocessing import preprocess

def run(classifier, proto=False):
    ## Reading input
    X_train = pd.read_csv('files/X_train.csv').drop('id', axis=1)
    X_test = pd.read_csv('files/X_test.csv').drop('id', axis=1)
    y_train = pd.read_csv('files/y_train.csv').drop('id', axis=1)
    X_columns = X_train.columns.values
    y_columns = y_train.columns.values

    ## Feature scaling
    X_train, X_test = scale(X_train, X_test)

    ## Splitting for validation
    if proto:
        X_train, X_test, y_train, y_test = preprocess(X_train, y_train)
        print("Finished splitting")

    ## Rebalancing the training classes
    X_train, y_train = sampling(X_train, y_train, X_columns, y_columns)
    print("Finished sampling")

    ## Training
    OvRClassifier = OneVsRestClassifier(classifier)
    OvRClassifier.fit(X_train, y_train)
    print("Finished training")

    ## Predicting
    y_predict = OvRClassifier.predict(X_test)
    if proto:
        # The labels are multiclass (0/1/2), so f1_score needs an explicit
        # average; 'micro' is used here.
        print(f1_score(y_test, y_predict, average='micro'))

    ## Writing output
    if not proto:
        output = pd.read_csv('files/sample.csv')
        output.iloc[:, 1] = y_predict
        output.to_csv(f"outputs/{OvRClassifier.__class__.__name__}.{classifier.__class__.__name__}.csv",
                      index=False)
    print("Finished predicting")
# sampling.py
import pandas as pd
from sklearn.utils import resample

ALGORITHM = "undersampling"  # one of {oversampling, undersampling, smote}; smote is only sketched below

def sampling(X_train, y_train, X_columns, y_columns):
    ## Rebalance the classes by resampling.
    X_train = pd.DataFrame(data=X_train, columns=X_columns)
    # Reset the index so concat aligns with the rebuilt X_train.
    y_train = pd.DataFrame(data=y_train, columns=y_columns).reset_index(drop=True)
    X = pd.concat([y_train, X_train], axis=1)
    label_0 = X[X.y == 0]
    label_1 = X[X.y == 1]
    label_2 = X[X.y == 2]
    if ALGORITHM == "oversampling":
        # Upsample classes 0 and 2 to the size of the majority class 1.
        label_0_upsampled = resample(label_0, n_samples=len(label_1), random_state=0)
        label_2_upsampled = resample(label_2, n_samples=len(label_1), random_state=0)
        upsampled = pd.concat([label_0_upsampled, label_1, label_2_upsampled]).sample(frac=1, random_state=0)
        y_train = upsampled.values[:, 0]
        X_train = upsampled.values[:, 1:]
    elif ALGORITHM == "undersampling":
        # Downsample the majority class 1 to the size of class 0.
        label_1_downsampled = resample(label_1, n_samples=len(label_0), random_state=0)
        downsampled = pd.concat([label_0, label_1_downsampled, label_2]).sample(frac=1, random_state=0)
        y_train = downsampled.values[:, 0]
        X_train = downsampled.values[:, 1:]
    return X_train, y_train
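
# Hedged sketch of the unimplemented "smote" option above: SMOTE synthesises
# new minority-class samples by interpolating between nearest neighbours
# instead of duplicating rows. Assumes the third-party imbalanced-learn
# package, which this project does not otherwise use; the function name is
# illustrative only.
def smote_sampling(X_train, y_train):
    from imblearn.over_sampling import SMOTE  # assumption: imbalanced-learn is installed
    return SMOTE(random_state=0).fit_resample(X_train, y_train)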
# scale.py
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_test):
    # Fit the scaler on the training data only, then apply it to both sets,
    # so no test-set statistics leak into training.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled
# Driver script: final run with a NuSVC model.
from sklearn.svm import NuSVC

from run import run

PROTOTYPING = False

run(classifier=NuSVC(nu=0.5, gamma='scale'), proto=PROTOTYPING)