# Some baselines to test

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import logging
import time
import csv
from sklearn.metrics import accuracy_score

import sys

from config import config

def cross_validate_kNN(X, y):
    """Grid-search a k-nearest-neighbours baseline over n_neighbors and leaf_size."""
    logging.info("Cross-validation KNN...")
    knn = KNeighborsClassifier(weights='uniform', algorithm='auto', n_jobs=-1)
    param_grid = {'n_neighbors': [5, 25, 100], 'leaf_size': [10, 50]}
    cross_validate(classifier=knn, parameters=param_grid, X=X, y=y)

def cross_validate_SVC(X, y):
    """Grid-search a linear SVM baseline over regularisation strength and iteration budget."""
    logging.info("Cross-validation SVC...")
    svm = LinearSVC(tol=1e-5)
    param_grid = {'C': [1, 10, 100, 1000, 10000], 'max_iter': [1000, 10000, 100000]}
    cross_validate(classifier=svm, parameters=param_grid, X=X, y=y)

def cross_validate_RFC(X, y):
    """Grid-search a random-forest baseline over tree count, depth and split/leaf fractions.

    NOTE: ``max_features='sqrt'`` replaces the old ``'auto'`` alias, which meant
    the same thing for classifiers, was deprecated in scikit-learn 1.1 and
    removed in 1.3 — so this keeps identical behaviour on old versions while
    not crashing on new ones.
    """
    logging.info("Cross-validation RFC...")
    classifier = RandomForestClassifier(max_features='sqrt', random_state=42, n_jobs=-1)
    parameters_RFC = {
        'n_estimators': [10, 50, 100, 1000],
        'max_depth': [5, 10, 50, 100, 500],
        'min_samples_split': [0.1, 0.4, 0.7, 1.0],  # fractions of n_samples
        'min_samples_leaf': [0.1, 0.5],
    }
    cross_validate(classifier=classifier, parameters=parameters_RFC, X=X, y=y)

def cross_validate(classifier, parameters, X, y):
    """Grid-search `classifier` over `parameters` with 2-fold CV and export the results.

    Writes the full ``cv_results_`` table to ``CrossValidation_results.csv`` and
    the best estimator/score/params to ``best_results.csv`` via ``export_dict``.

    Args:
        classifier: an unfitted scikit-learn estimator.
        parameters: grid dict passed to ``GridSearchCV``.
        X: samples; any array of shape ``(n_samples, ...)`` — flattened below.
        y: labels, raveled before fitting.
    """
    # Flatten each sample to a single feature vector. The sample count is taken
    # from the data itself (was hard-coded to (36223, 500 * 129), which broke
    # for any other dataset size).
    X = X.reshape((X.shape[0], -1))

    clf = GridSearchCV(classifier, parameters, scoring='accuracy', n_jobs=-1, verbose=3, cv=2)
    clf.fit(X, y.ravel())

    export_dict(clf.cv_results_['mean_fit_time'], clf.cv_results_['std_fit_time'],
                clf.cv_results_['mean_score_time'], clf.cv_results_['std_score_time'],
                clf.cv_results_['mean_test_score'], clf.cv_results_['std_test_score'],
                clf.cv_results_['params'], file_name='CrossValidation_results.csv',
                first_row=('mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
                           'mean_test_score', 'std_test_score', 'params'))

    best_columns = ('BEST ESTIMATOR', 'BEST SCORE', 'BEST PARAMS')
    export_dict([clf.best_estimator_], [clf.best_score_], [clf.best_params_],
                file_name='best_results.csv', first_row=best_columns)

def try_sklearn_classifiers(X, y):
    """Fit baseline classifiers on an 80/20 split and export score/runtime per model.

    ``names`` and ``classifiers`` are parallel lists zipped together — keep the
    commented-out entries in lockstep when re-enabling a baseline.

    Args:
        X: samples; any array of shape ``(n_samples, ...)`` — flattened below.
        y: labels, raveled before fitting.
    """
    logging.info("Training the simple classifiers: kNN, Linear SVM, Random Forest and Naive Bayes.")

    names = [# "Nearest Neighbors",
             # "Linear SVM",
             # "Random Forest",
             # "Naive Bayes",
             "Linear SVM"
             ]

    classifiers = [
        # KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, n_jobs=-1),
        # LinearSVC(tol=1e-5, C=1, random_state=42, max_iter=1000),
        # RandomForestClassifier(n_estimators=30, max_depth=20, max_features='auto', random_state=42, n_jobs=-1),
        # GaussianNB(),
        LinearSVC(tol=1e-3, C=20, random_state=42, max_iter=500)
        ]

    # Flatten each sample to a single feature vector; the sample count comes
    # from the data itself (was hard-coded to (36223, 500 * 129), which broke
    # for any other dataset size).
    X = X.reshape((X.shape[0], -1))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

    scores = []
    runtimes = []
    for name, clf in zip(names, classifiers):
        logging.info(name)
        start_time = time.time()
        clf.fit(X_train, y_train.ravel())

        score = clf.score(X_test, y_test.ravel())
        scores.append(score)

        runtime = (time.time() - start_time)
        runtimes.append(runtime)

        logging.info("--- Score: %s " % score)
        logging.info("--- Runtime: %s for seconds ---" % runtime)
    export_dict(names, scores, runtimes, first_row=('Model', 'Score', 'Runtime'), file_name='classifiers_results.csv')

def export_dict(*columns, first_row, file_name):
    """Write parallel columns as CSV rows into ``config['model_dir']``.

    Args:
        *columns: equal-length iterables; element ``i`` of every column forms row ``i``.
        first_row: header tuple written as the first CSV row.
        file_name: name of the file created inside ``config['model_dir']``.
    """
    rows = zip(*columns)
    file = config['model_dir'] + '/' + file_name
    # newline='' is required by the csv module; without it Windows output gets
    # a blank line between every data row.
    with open(file, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(first_row)
        writer.writerows(rows)