# Some baselines to test

import csv
import logging
import sys
import time

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

from config import config


def cross_validate_kNN(X, y):
    """Grid-search a k-nearest-neighbours classifier on (X, y).

    Results are written to CSV files via :func:`cross_validate`.
    """
    logging.info("Cross-validation KNN...")
    classifier = KNeighborsClassifier(weights='uniform', algorithm='auto',
                                      n_jobs=-1)
    parameters_KNN = {'n_neighbors': [5, 25, 100],
                      'leaf_size': [10, 50]}
    cross_validate(classifier=classifier, parameters=parameters_KNN, X=X, y=y)


def cross_validate_SVC(X, y):
    """Grid-search a linear SVM on (X, y) over a log-spaced C grid.

    Results are written to CSV files via :func:`cross_validate`.
    """
    logging.info("Cross-validation SVC...")
    classifier = LinearSVC(tol=1e-2, max_iter=500)
    # BUGFIX: original grid contained the syntax error `0..1`; the
    # log-spaced sequence 0.01, 0.1, 1, 10, 100 was clearly intended.
    parameters_SVC = {'C': [0.01, 0.1, 1, 10, 100]}
    cross_validate(classifier=classifier, parameters=parameters_SVC, X=X, y=y)


def cross_validate_RFC(X, y):
    """Grid-search a random forest on (X, y).

    Results are written to CSV files via :func:`cross_validate`.
    """
    logging.info("Cross-validation RFC...")
    # NOTE(review): max_features='auto' was deprecated and later removed in
    # recent scikit-learn releases ('sqrt' is the equivalent for
    # classifiers) — confirm the pinned sklearn version still accepts it.
    classifier = RandomForestClassifier(max_features='auto', random_state=42,
                                        n_jobs=-1)
    parameters_RFC = {'n_estimators': [10, 50, 100, 1000],
                      'max_depth': [5, 10, 50, 100, 500],
                      'min_samples_split': [0.1, 0.4, 0.7, 1.0],
                      'min_samples_leaf': [0.1, 0.5]}
    cross_validate(classifier=classifier, parameters=parameters_RFC, X=X, y=y)


def cross_validate(classifier, parameters, X, y):
    """Run a 2-fold grid search and export all results to CSV.

    Parameters
    ----------
    classifier : estimator
        The (unfitted) scikit-learn estimator to tune.
    parameters : dict
        Parameter grid passed to ``GridSearchCV``.
    X : ndarray
        Samples; flattened to 2-D (one feature vector per sample).
    y : ndarray
        Labels; raveled before fitting.

    Writes ``CrossValidation_results.csv`` (full grid) and
    ``best_results.csv`` (winning estimator) into ``config['model_dir']``.
    """
    # Flatten each sample to one feature vector. Generalized from the
    # hard-coded (36223, 500 * 129) reshape so any sample count works;
    # behavior is unchanged for the original 36223 x 500 x 129 input.
    X = X.reshape((X.shape[0], -1))
    clf = GridSearchCV(classifier, parameters, scoring='accuracy',
                       verbose=3, cv=2)
    clf.fit(X, y.ravel())
    export_dict(clf.cv_results_['mean_fit_time'],
                clf.cv_results_['std_fit_time'],
                clf.cv_results_['mean_score_time'],
                clf.cv_results_['std_score_time'],
                clf.cv_results_['mean_test_score'],
                clf.cv_results_['std_test_score'],
                clf.cv_results_['params'],
                file_name='CrossValidation_results.csv',
                first_row=('mean_fit_time', 'std_fit_time',
                           'mean_score_time', 'std_score_time',
                           'mean_test_score', 'std_test_score', 'params'))
    best_columns = ('BEST ESTIMATOR', 'BEST SCORE', 'BEST PARAMS')
    export_dict([clf.best_estimator_], [clf.best_score_], [clf.best_params_],
                file_name='best_results.csv', first_row=best_columns)


def try_sklearn_classifiers(X, y):
    """Train baseline classifiers on an 80/20 split and export scores.

    Only the Linear SVM is currently enabled; the other baselines are kept
    commented out for quick re-activation.
    """
    logging.info("Training the simple classifiers: kNN, Linear SVM, "
                 "Random Forest and Naive Bayes.")
    names = [  # "Nearest Neighbors",
               # "Linear SVM",
               # "Random Forest",
               # "Naive Bayes",
               "Linear SVM"
             ]
    classifiers = [
        # KNeighborsClassifier(n_neighbors=5, weights='uniform',
        #                      algorithm='auto', leaf_size=30, n_jobs=-1),
        # LinearSVC(tol=1e-5, C=1, random_state=42, max_iter=1000),
        # RandomForestClassifier(n_estimators=30, max_depth=20,
        #                        max_features='auto', random_state=42,
        #                        n_jobs=-1),
        # GaussianNB(),
        LinearSVC(tol=1e-3, C=20, random_state=42, max_iter=500)
    ]
    # Flatten each sample to one feature vector (generalized from the
    # hard-coded (36223, 500 * 129) reshape — see cross_validate()).
    X = X.reshape((X.shape[0], -1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                        random_state=42)
    scores = []
    runtimes = []
    for name, clf in zip(names, classifiers):
        logging.info(name)
        start_time = time.time()
        clf.fit(X_train, y_train.ravel())
        score = clf.score(X_test, y_test.ravel())
        scores.append(score)
        runtime = (time.time() - start_time)
        runtimes.append(runtime)
        logging.info("--- Score: %s " % score)
        # BUGFIX: message read "for seconds"; corrected wording.
        logging.info("--- Runtime: %s seconds ---" % runtime)
    export_dict(names, scores, runtimes,
                first_row=('Model', 'Score', 'Runtime'),
                file_name='classifiers_results.csv')


def export_dict(*columns, first_row, file_name):
    """Write parallel columns as rows of a CSV file in config['model_dir'].

    Parameters
    ----------
    *columns : sequences
        Equal-length sequences; element i of each forms row i.
    first_row : tuple
        Header row written first.
    file_name : str
        File name created inside ``config['model_dir']``.
    """
    rows = zip(*columns)
    file = config['model_dir'] + '/' + file_name
    # BUGFIX: newline='' is required by the csv module; without it the
    # writer emits blank interleaved rows on Windows.
    with open(file, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(first_row)
        for row in rows:
            writer.writerow(row)