import pandas as pd
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.model_selection import cross_validate, StratifiedKFold
import numpy as np

import categorical_util as cu
import config as cfg

# Cell texts that should be read as False. Anything else stays True, which
# keeps the previous behaviour for every value except these explicit tokens.
_FALSE_CELLS = frozenset({'', 'false', 'f', '0', 'no', 'n'})


def _parse_bool(cell):
    """Convert a raw CSV cell to a bool.

    read_csv passes the cell text to a converter, and ``bool()`` of any
    non-empty string (including 'False' and '0') is True, so using ``bool``
    directly as the converter cannot produce False for such cells.

    Args:
        cell: Raw cell value handed over by ``pd.read_csv``.

    Returns:
        False for an empty cell or an explicit false token (case and
        surrounding whitespace ignored), True otherwise.
    """
    return str(cell).strip().lower() not in _FALSE_CELLS


# Training data; the non-binary alternative is cfg.get_data_train().
df = pd.read_csv(cfg.get_data_train_binary(), converters={'isStaticClass': _parse_bool})

# Split into model inputs and target. The dropped columns are the target plus
# columns that are not fed to the model (several look like identifiers; why
# numInnerClasses is excluded is not visible from this script).
features = df.drop(columns=['index', 'fullpathname', 'classname', 'library', 'numInnerClasses', 'label'])
labels = df.loc[:, 'label']

# Encode the features; the alternative encoding is cu.get_replace_map(features).
features_replace = cu.get_one_hot_encoding(features)

# Takes the features and labels based on config.
# NOTE(review): if get_resample_config over-/under-samples, doing it before
# the cross-validation split lets resampled rows leak across folds -- confirm
# against config and consider resampling inside each training fold instead.
features, labels = cfg.get_resample_config(features_replace, labels)

# Estimator to evaluate; which one is used is decided in config.
clf = cfg.get_classifier()

# 10 stratified folds, shuffled with a fixed seed for the fold assignment.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)

# Metrics collected per fold; the results are read back below under the
# 'test_<name>' keys of the returned dict.
scoring = dict(
    precision='precision_weighted',
    recall='recall_weighted',
    f1='f1_weighted',
    mcc=make_scorer(matthews_corrcoef),
)
scores = cross_validate(estimator=clf, X=features, y=labels, scoring=scoring, cv=skf)

def _report_metric(title, fold_scores):
    """Print one metric's per-fold scores, then their mean and standard deviation.

    Args:
        title: Banner line printed before the scores.
        fold_scores: Per-fold values taken from the cross_validate result.

    Returns:
        ``fold_scores`` unchanged, so the caller can bind it to a name.
    """
    print(title)
    for score in fold_scores:
        print(score)
    print("Means: %0.2f" % (np.mean(fold_scores)))
    print("Standard Deviation: %0.2f" % (np.std(fold_scores)))
    return fold_scores


# One report section per metric, in the same order and with the same output
# as before; a blank line separates consecutive sections.
precision_scores = _report_metric("PRECISION SCORE++++++++++++++++++++++++++", scores['test_precision'])

recall_scores = _report_metric("RECALL SCORE++++++++++++++++++++++++++", scores['test_recall'])

f1_scores = _report_metric("f1 SCORE++++++++++++++++++++++++++", scores['test_f1'])

mcc_scores = _report_metric("mcc SCORE++++++++++++++++++++++++++", scores['test_mcc'])
