Commit 683116b1 authored by slavenc

add project 1 files

from model import Network
import pickle
import pandas as pd
import numpy as np
import torch
import argparse
import interpolate

parser = argparse.ArgumentParser()
parser.add_argument('--save-file',
                    help='Where the trained model is saved')
parser.add_argument('--cuda', action='store_true',
                    help='Whether to use CUDA device')
args = parser.parse_args()

# Load the pickled network and the feature metadata saved at training time.
with open(args.save_file, 'rb') as model_file:
    net = pickle.load(model_file)
with open(f"{args.save_file}.feature_indices", 'rb') as feature_file:
    feature_indices = pickle.load(feature_file)
with open(f"{args.save_file}.feature_defaults", 'rb') as defaults_file:
    default_features = pickle.load(defaults_file)

X_test_raw = pd.read_csv('X_test.csv')
X_test = X_test_raw.values[:, 1:]  # drop the id column
print(f"Original shape of X_test: {X_test.shape}")
print("Recentering X_test")
X_test = interpolate.load(X_test, default_features)
print("Selecting features from X_test")
X_test = X_test[:, feature_indices]
print(f"Final shape of X_test: {X_test.shape}")

device = 'cuda' if args.cuda else 'cpu'
X_test = torch.from_numpy(X_test).float().to(device)
net.to(device)
net.eval()
with torch.no_grad():  # inference only, no gradients needed
    logits = net(X_test).squeeze()

print(f"Writing outputs to {args.save_file}.solution.csv")
output = pd.read_csv('sample.csv')
for i in range(output.shape[0]):
    output.iat[i, 1] = logits[i].item()  # store a Python scalar, not a tensor
output.to_csv(f"{args.save_file}.solution.csv", index=False)
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def select(raw_features, labels, top_k, variance_limit):
    # Features whose variance exceeds the limit are considered unusable.
    var = np.var(raw_features, axis=0)
    crazy_variance_indices = set(np.nonzero(var > variance_limit)[0])
    k = min(top_k, raw_features.shape[1])
    # Score every feature with the ANOVA F-statistic, then rank by score.
    best_features_model = SelectKBest(score_func=f_classif, k='all')
    feature_scores = best_features_model.fit(raw_features, labels).scores_
    combined = list(zip(range(len(feature_scores)), feature_scores))
    combined.sort(key=lambda x: -x[1])
    combined = [pair for pair in combined
                if pair[0] not in crazy_variance_indices]
    final_indices = [pair[0] for pair in combined[:k]]
    return final_indices
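# A minimal usage sketch (random data and illustrative thresholds, not the
# project's real values): keep the 5 best-scoring features whose variance
# stays under the limit.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    X = rng.randn(100, 20)
    y = rng.randint(0, 2, size=100)
    print(select(X, y, top_k=5, variance_limit=4.0))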
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 20 13:13:39 2019
@author: made_
"""
import math

# Interpolation approximates a NaN value by taking the mean of the previous
# and next non-NaN values in the same column.
def interpolate2(features):
    for i in range(features.shape[0] - 1):
        for j in range(features.shape[1]):
            if math.isnan(features[i][j]):
                if i < features.shape[0] - 2:
                    if not math.isnan(features[i + 1][j]):
                        features[i][j] = 0.5 * (features[i - 1][j] + features[i + 1][j])
                    else:
                        # Here we simply jump one row further; better would be
                        # to scan ahead until a non-NaN value is found.
                        features[i][j] = 0.5 * (features[i - 1][j] + features[i + 2][j])
                else:
                    # No usable next value: simply take the previous one.
                    features[i][j] = features[i - 1][j]
    return features
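# A minimal usage sketch (illustrative data only): isolated NaNs are replaced
# by the mean of their vertical neighbours, or by the previous value near the
# bottom edge. Note the array is modified in place.
if __name__ == '__main__':
    import numpy as np
    demo = np.array([[1.0, 2.0],
                     [np.nan, 4.0],
                     [3.0, np.nan],
                     [5.0, 6.0]])
    print(interpolate2(demo))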
import os
import pickle
import argparse
import interpolate
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import r2_score, mean_squared_error
# import data
X_train_raw = pd.read_csv('X_train.csv')
Y_train_raw = pd.read_csv('y_train.csv')
X_train = X_train_raw.values[:, 1:]
Y_train = Y_train_raw.values[:, 1]
original_shape = X_train.shape
parser = argparse.ArgumentParser()
parser.add_argument('--save-dir', type=str,
                    help='Where to save components')
parser.add_argument('--interpolation-iters', type=int,
                    help='How many iterations of the IterativeImputer to run')
parser.add_argument('--top-k', type=int,
                    help='How many features to keep')
parser.add_argument('--outlier-proportion', type=float,
                    help='Proportion of data that are outliers')
parser.add_argument('--folds', type=int,
                    help='How many folds to do CV on')
parser.add_argument('--predict', action='store_true',
                    help='Whether to predict')
args = parser.parse_args()
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir)
# standardize features
print("Recentering features...")
from sklearn.preprocessing import StandardScaler
scaler_path = os.path.join(args.save_dir, 'scaler.pkl')
if not os.path.exists(scaler_path):
    scaler = StandardScaler()
    scaler.fit(X_train)
    with open(scaler_path, 'wb') as output:
        pickle.dump(scaler, output)
else:
    with open(scaler_path, 'rb') as inp:
        scaler = pickle.load(inp)
X_train = scaler.transform(X_train)
print("Interpolating features...")
imputer_path = os.path.join(args.save_dir, 'imputer.pkl')
if not os.path.exists(imputer_path):
# imp = IterativeImputer(max_iter=args.interpolation_iters,
# initial_strategy='median',
# verbose=2)
imp = SimpleImputer(strategy='median', verbose=2)
imp.fit(X_train)
with open(imputer_path, 'wb') as output:
pickle.dump(imp, output)
else:
with open(imputer_path, 'rb') as inp:
imp = pickle.load(inp)
X_train = imp.transform(X_train)
# remove useless features
print("Selecting features...")
selector_path = os.path.join(args.save_dir, 'selector.pkl')
if not os.path.exists(selector_path):
    selector = SelectKBest(score_func=f_regression, k=args.top_k)
    # selector = SelectFromModel(ExtraTreesClassifier(n_estimators=50),
    #                            threshold=-np.inf,
    #                            max_features=args.top_k)
    selector.fit(X_train, Y_train)
    with open(selector_path, 'wb') as output:
        pickle.dump(selector, output)
else:
    with open(selector_path, 'rb') as inp:
        selector = pickle.load(inp)
X_train = selector.transform(X_train)
print(X_train.shape)
print("Removing outliers...")
outlier_path = os.path.join(args.save_dir, 'outlier.pkl')
if not os.path.exists(outlier_path):
outlier = IsolationForest(n_estimators=50,
behaviour='new',
contamination=args.outlier_proportion)
outlier.fit(X_train)
with open(outlier_path, 'wb') as output:
pickle.dump(outlier, output)
else:
with open(outlier_path, 'rb') as inp:
outlier = pickle.load(inp)
inlier_indices = np.nonzero(outlier.predict(X_train) > 0)[0]
X_train = X_train[inlier_indices, :]
Y_train = Y_train[inlier_indices]
#print(f"Original shape of X_train: {original_shape}")
#print(f"Final shape of X_train: {X_train.shape}")
print("Training model...")
model_path = os.path.join(args.save_dir, 'model.pkl')
if not os.path.exists(model_path):
# model = RidgeCV()
# model = LassoCV(tol=0.1)
# grid = {
# 'kernel': ['rbf', 'laplacian', 'sigmoid', 'polynomial'],
# 'alpha': [10.0**i for i in range (-3, 2)],
# }
# model = GridSearchCV(
# KernelRidge(),
# grid,
# cv=args.folds,
# scoring='r2',
# )
# grid = {
# 'kernel': ['rbf', 'sigmoid'],
# 'C': [0.1, 1.0, 10],
# 'epsilon': [0.1, 0.2, 0.3]
# }
# model = GridSearchCV(
# SVR(gamma='scale', verbose=True),
# grid,
# cv=args.folds,
# scoring='r2',
# )
grid = {
'learning_rate': [10.0**i for i in range (-2, 2)],
# 'n_estimators': [10, 50, 100, 500, 1000],
'max_depth': [1, 2, 3, 4, 5]
}
model = GridSearchCV(
GradientBoostingRegressor(
verbose=1,
n_iter_no_change=10,
n_estimators=500, # early stopping means we won't reach
),
grid,
cv=args.folds,
scoring='r2',
)
model.fit(X_train, Y_train)
with open(model_path, 'wb') as output:
pickle.dump(model, output)
else:
with open(model_path, 'rb') as inp:
model = pickle.load(inp)
Y_hat = model.predict(X_train)
print(r2_score(Y_train, Y_hat))
print(mean_squared_error(Y_train, Y_hat))
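# A small sketch (not in the original script): when `model` is a fitted
# GridSearchCV, the chosen hyperparameters can be inspected via the standard
# scikit-learn attributes.
if hasattr(model, 'best_params_'):
    print(f"Best hyperparameters: {model.best_params_}")
    print(f"Best CV r2 score: {model.best_score_}")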
if args.predict:
    X_test_raw = pd.read_csv('X_test.csv')
    X_test = X_test_raw.values[:, 1:]
    # print(f"Original shape of X_test: {X_test.shape}")
    print("Running pipeline on X_test...")
    # apply exactly the same preprocessing steps as during training
    X_test = scaler.transform(X_test)
    X_test = imp.transform(X_test)
    X_test = selector.transform(X_test)
    Y_hat_test = model.predict(X_test)
    print("Writing test outputs...")
    output = pd.read_csv('sample.csv')
    for i in range(output.shape[0]):
        output.iat[i, 1] = Y_hat_test[i]
    output.to_csv(os.path.join(args.save_dir, 'predictions.csv'), index=False)
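# Example invocation (hypothetical script name and values; the flags match the
# argparse definitions above):
#   python train.py --save-dir runs/gbr --interpolation-iters 10 \
#       --top-k 75 --outlier-proportion 0.05 --folds 5 --predict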
import torch
import torch.nn as nn

# A simple feed-forward network with one sigmoid hidden layer.
class Network(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Network, self).__init__()
        self.linear_1 = nn.Linear(input_size, hidden_size)
        self.activation = nn.Sigmoid()
        self.linear_2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        linear_1 = self.linear_1(x)
        activation = self.activation(linear_1)
        linear_2 = self.linear_2(activation)
        return linear_2
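# A minimal sanity-check sketch (illustrative sizes, not the ones used for
# training): push a random batch through the network.
if __name__ == '__main__':
    net = Network(input_size=100, hidden_size=32, output_size=1)
    x = torch.randn(4, 100)  # batch of 4 samples
    print(net(x).shape)      # torch.Size([4, 1])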
import numpy as np

def compute_inliers(X_train, z=3):
    """
    Returns the indices of samples where every single feature value is within
    `z` standard deviations of the mean for that feature.
    """
    train_mean = np.mean(X_train, axis=0)
    train_stddev = np.std(X_train, axis=0)
    is_in_range = np.abs(X_train - train_mean) <= z * train_stddev
    # a sample is an inlier only if the check holds for every feature
    in_range_indices = np.nonzero(
        np.sum(is_in_range, axis=1) == is_in_range.shape[1])[0]
    return in_range_indices
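# A minimal usage sketch (illustrative data): the sample with the extreme
# value 100.0 fails the z-score test and is dropped.
if __name__ == '__main__':
    X = np.array([[1.0, 2.0],
                  [1.1, 2.1],
                  [0.9, 1.9],
                  [100.0, 2.0]])
    print(compute_inliers(X, z=1.5))  # -> [0 1 2]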