Commit 10421584 authored by Feliks Kiszkurno's avatar Feliks Kiszkurno
Browse files

Fixed a bug that made labeled data achieve unrealistically good results. :(

parent 745f4383
......@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"metadata": {
"collapsed": true
},
......@@ -13,10 +13,10 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"outputs": [],
"source": [
"data = pd.read_csv('../results/data/hor_06.csv')\n",
"data = pd.read_csv('../results/data/hor1_06.csv')\n",
"classesn = data['CLASSN']"
],
"metadata": {
......@@ -28,23 +28,15 @@
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (<ipython-input-11-e87ec824444d>, line 12)",
"output_type": "error",
"traceback": [
"\u001B[0;36m File \u001B[0;32m\"<ipython-input-11-e87ec824444d>\"\u001B[0;36m, line \u001B[0;32m12\u001B[0m\n\u001B[0;31m labels = [labels[idx] = labels_translator[key] for idx in id_new]\u001B[0m\n\u001B[0m ^\u001B[0m\n\u001B[0;31mSyntaxError\u001B[0m\u001B[0;31m:\u001B[0m invalid syntax\n"
]
}
],
"execution_count": 7,
"outputs": [],
"source": [
"labels_translator = {0: 'Very Low',\n",
" 1: 'Low',\n",
" 2: 'Medium',\n",
" 3: 'High',\n",
" 4: 'Very High'}\n",
"\n",
"labels = [None] * len(classesn)\n",
"find_all = lambda x, xs: [i for (y, i) in zip(xs, range(len(xs))) if x == y]\n",
"for key in labels_translator.keys():\n",
......
......@@ -14,15 +14,15 @@ import slopestabilityML
def ask_committee(ml_result_class, test_results, *, random_seed=False):
results = np.array([])
classes_correct = {}
for test_name in sorted(test_results.keys()):
if settings.settings['norm_class'] is True:
class_in = test_results[test_name]['CLASSN']
elif settings.settings['norm_class'] is False:
elif settings.settings['norm_class'] is False and settings.settings['use_labels'] is False:
class_in = test_results[test_name]['CLASS']
elif settings.settings['use_labels'] is True:
class_in = test_results[test_name]['LABELS']
class_in = slopestabilitytools.label2numeric(class_in)
else:
print('I don\'t know which class to use! Exiting...')
......@@ -79,7 +79,7 @@ def ask_committee(ml_result_class, test_results, *, random_seed=False):
for test_group in sorted(tests_ordered.keys()):
for test_name in tests_ordered[test_group]:
classes_correct_temp = classes_correct[test_name].to_numpy()
classes_correct_temp = np.array(classes_correct[test_name])
classes_correct_temp = classes_correct_temp.reshape([len(results_voting[test_name]), 1])
score = len(np.argwhere(results_voting[test_name] == classes_correct_temp)) / len(classes_correct_temp)
if test_group is 'train':
......
......@@ -13,14 +13,14 @@ import pandas as pd
def preprocess_data(data_set):
if settings.settings['norm'] is True:
x_train = data_set.drop(['X', 'Z', 'INM', 'INMN', 'RES', 'CLASS', 'CLASSN'], axis='columns')
x_train = data_set.drop(['X', 'Z', 'INM', 'INMN', 'RES', 'CLASS', 'CLASSN', 'LABELS'], axis='columns')
else:
x_train = data_set.drop(['X', 'Z', 'INM', 'INMN', 'RESN', 'CLASS', 'CLASSN'], axis='columns')
x_train = data_set.drop(['X', 'Z', 'INM', 'INMN', 'RESN', 'CLASS', 'CLASSN', 'LABELS'], axis='columns')
if settings.settings['sen'] is False:
x_train = x_train.drop(['SEN'], axis='columns')
if settings.settings['norm_class'] is True:
if settings.settings['norm_class'] is True and settings.settings['use_labels'] is False:
y_train = pd.DataFrame(data_set['CLASSN'])
elif settings.settings['use_labels'] is True:
y_train = pd.DataFrame(data_set['LABELS'])
......
......@@ -65,8 +65,8 @@ def run_classification(test_training, test_prediction, test_results, clf, clf_na
cat_feat = ['LABELS']
cat_lab = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
cat_trans = OneHotEncoder(categories=[cat_lab])
preprocessor = ColumnTransformer(transformers=[('num', num_trans, num_feat),
('cat', cat_trans, cat_feat)])
preprocessor = ColumnTransformer(transformers=[('num', num_trans, num_feat)])
#('cat', cat_trans, cat_feat)])
else:
preprocessor = ColumnTransformer(transformers=[('num', num_trans, num_feat)])
......@@ -97,25 +97,18 @@ def run_classification(test_training, test_prediction, test_results, clf, clf_na
for test_name_pred in test_prediction:
# Prepare data
x_question, y_answer = slopestabilityML.preprocess_data(test_results[test_name_pred])
# y_pred = clf_pipeline.score(x_question, y_answer)
y_pred = clf_pipeline.predict(x_question)
result_class[test_name_pred] = y_pred
# print(y_pred)
score1 = clf_pipeline.score(x_question, y_answer)
score = accuracy_score(y_answer, y_pred)
if score1 == score:
print('MATCH!')
else:
print('MISMATCH!')
print('score: {score:.2f} %'.format(score=score*100))
if settings.settings['norm_class'] is True:
class_in = test_results[test_name_pred]['CLASSN']
elif settings.settings['norm_class'] is False:
elif settings.settings['norm_class'] is False and settings.settings['use_labels'] is False:
class_in = test_results[test_name_pred]['CLASS']
elif settings.settings['use_labels'] is True:
class_in = test_results[test_name]['LABELS']
class_in = test_results[test_name_pred]['LABELS']
else:
print('I don\'t know which class to use! Exiting...')
exit(0)
......
......@@ -17,7 +17,7 @@ def import_tests(abs_path=''):
test_names = slopestabilitytools.datamanagement.test_list('.csv', abs_path=abs_path)
for test_name in test_names:
test_result_curr = pd.read_csv(abs_path + settings.settings['data_folder'] + '/' + test_name + '.csv', index_col=0)
test_result_curr = pd.read_csv(abs_path + settings.settings['data_folder'] + test_name + '.csv', index_col=0)
test_results.update({test_name: test_result_curr})
return test_results
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment