Skip to content
Snippets Groups Projects
Commit bb2bb975 authored by Michael Baumgartner
Browse files

Fine tune DBSCAN outlier removal.

parent 7dfa5d62
No related branches found
No related tags found
No related merge requests found
......@@ -29,7 +29,7 @@ import umap
# Global variables
IMPUTER_FIXED = True
FEATURE_FIXED = True
NUM_FEATURES_FIXED = False
NUM_FEATURES_FIXED = True
OUTLIER_FIXED = True
GMM_COMPONENTS_FIXED = True
USE_PEARSON = False # Use correlation or not, if false, use pearson fixed true
......@@ -45,7 +45,7 @@ UMAP_NEIGHBOURS = 2 # Number of neighbors considered for the manifold approximat
# Fixed parameters
IMPUTER = ['mean']
FEATURE_SELECTOR = ['selectkbest']
NUM_FEATURES = 200
NUM_FEATURES = 150
OUTLIER_DETECTOR = ['DBS']
PEARSON_THRESHOLD = 0.93
GMM_COMPONENTS = 3
......@@ -386,7 +386,7 @@ class RegressionModel(object):
embedding = PCA(n_components=2).fit_transform(X_train)
db = DBSCAN(eps=2.8, min_samples=5).fit(embedding)
db = DBSCAN(eps=2.9, min_samples=5).fit(embedding)
inlier_index = db.core_sample_indices_
X_train = X_train[inlier_index]
......@@ -443,7 +443,7 @@ def main():
scaler = StandardScaler()
train_features_pre_imp = scaler.fit_transform(train_features)
test_features_pre_imp = scaler.transform(test_features)
# Remove features with >50% missing values
# Remove features with >50% missing values/zeros
indeces_of_zerovec = [104, 129, 143, 489, 530]
train_features_pre_imp= np.delete(train_features_pre_imp, indeces_of_zerovec,axis=1)
test_features_pre_imp = np.delete(test_features_pre_imp, indeces_of_zerovec,axis=1)
......@@ -594,6 +594,9 @@ def main():
# print(f"Shape of training features before outlier detection: {train_features_imp.shape}")
# print(f"Shape of training features after outlier detection: {train_features_out.shape}")
# TODO: Repeat the imputation and feature selection steps on the original data with the outliers removed
# iterate through models
if MODEL_FIXED:
models = MODEL
......@@ -614,7 +617,7 @@ def main():
# TODO: Do GridSearch
# TODO: Maybe need to determine all params in the param_grid, not only the ones we want to explore
# TODO: Maybe need to set the scoring function to the R2 score
search = GridSearchCV(model.model, param_grid, cv=cross_val, n_jobs=1, scoring='r2')
search = GridSearchCV(model.model, param_grid, cv=cross_val, n_jobs=1, scoring='r2', verbose=3)
# Actually perform the repeated k-fold cross validation search
# results is the fitted search object; its best_params_ attribute holds the best parameters found for the current problem
results = search.fit(train_features_out, train_GT_out)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment