Fine-tune DBSCAN outlier removal.

bb2bb975 · Michael Baumgartner · 7dfa5d62 · bb2bb975
Commit bb2bb975 authored 2 years ago by Michael Baumgartner
--- a/task1/solution.py
+++ b/task1/solution.py
@@ -29,7 +29,7 @@ import umap
 # Global variables
 IMPUTER_FIXED = True
 FEATURE_FIXED = True
-NUM_FEATURES_FIXED = False
+NUM_FEATURES_FIXED = True
 OUTLIER_FIXED = True
 GMM_COMPONENTS_FIXED = True
 USE_PEARSON = False # Use correlation or not, if false, use pearson fixed true
@@ -45,7 +45,7 @@ UMAP_NEIGHBOURS = 2 # Number of neighbors considered for the manifold approximat
 # Fixed parameters
 IMPUTER = ['mean']
 FEATURE_SELECTOR = ['selectkbest']
-NUM_FEATURES = 200
+NUM_FEATURES = 150
 OUTLIER_DETECTOR = ['DBS']
 PEARSON_THRESHOLD = 0.93
 GMM_COMPONENTS = 3
@@ -386,7 +386,7 @@ class RegressionModel(object):
        embedding = PCA(n_components=2).fit_transform(X_train)
        

-        db = DBSCAN(eps=2.8, min_samples=5).fit(embedding)
+        db = DBSCAN(eps=2.9, min_samples=5).fit(embedding)

        inlier_index = db.core_sample_indices_
        X_train = X_train[inlier_index]
@@ -443,7 +443,7 @@ def main():
    scaler = StandardScaler()
    train_features_pre_imp = scaler.fit_transform(train_features)
    test_features_pre_imp = scaler.transform(test_features) 
-    # Remove features with >50% missing values
+    # Remove the hard-coded feature columns that have >50% missing values or zeros
    indeces_of_zerovec = [104, 129, 143, 489, 530]
    train_features_pre_imp= np.delete(train_features_pre_imp, indeces_of_zerovec,axis=1)
    test_features_pre_imp = np.delete(test_features_pre_imp, indeces_of_zerovec,axis=1)
@@ -594,6 +594,9 @@ def main():
                                    # print(f"Shape of training features before outlier detection: {train_features_imp.shape}")
                                    # print(f"Shape of training features after outlier detection: {train_features_out.shape}")

+                                    # TODO: Repeat the imputation and feature selection steps on the original data after removing the outliers
+                                    
+
                                    # iterate through models
                                    if MODEL_FIXED:
                                        models = MODEL
@@ -614,7 +617,7 @@ def main():
                                        # TODO: Do GridSearch
                                        # TODO: Maybe need to determine all params in the param_grid, not only the ones we want to explore
                                        # TODO: Maybe need to set the scoring function to the R2 score
-                                        search = GridSearchCV(model.model, param_grid, cv=cross_val, n_jobs=1, scoring='r2')
+                                        search = GridSearchCV(model.model, param_grid, cv=cross_val, n_jobs=1, scoring='r2', verbose=3)
                                        # Actually perform the repeated k-fold cross validation search
                                        # results will contain the best parameters for the current problem
                                        results = search.fit(train_features_out, train_GT_out)