# Random forest classifier trained on the full dataset

from pprint import pprint

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, make_scorer, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import ShuffleSplit

import categorical_util as cu
import config as cfg
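# categorical_util and config appear to be project-local helper modules
# (categorical feature encoding and run/dataset configuration, respectively)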

# load the training data; the bool converter assumes isStaticClass is stored as an
# empty/non-empty value, since bool() maps any non-empty string (even "False") to True
df = pd.read_csv(cfg.get_data_train(), converters={'isStaticClass': bool})

# separate the features from the labels, dropping identifier and metadata columns
features = df.drop(columns=['index', 'fullpathname', 'classname', 'library', 'numInnerClasses', 'label'])
# print(features)
# print(features.dtypes)
labels = df.loc[:, 'label']
# print(labels)
# print the class distribution of the full dataset
print(labels.reset_index(name='label').groupby(['label']).size())
# features_replace = cu.get_replace_map(features)
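# one-hot encode the categorical feature columns so the classifier receives numeric input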
features_replace = cu.get_one_hot_encoding(features)
# print(features_replace.dtypes)

# resample the features and labels according to the strategy selected in the config
features, labels = cfg.get_resample_config(features_replace, labels)
# print(features)
# print(features.dtypes)

# create the classifier
clf = RandomForestClassifier(n_jobs=2, n_estimators=1000)

# single shuffled 70/30 train/test split (so the loop below runs exactly once)
cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=50)

for train_index, test_index in cv.split(features, labels):
    # select the training and test rows by position
    # features_train, features_test = features[train_index], features[test_index]
    # labels_train, labels_test = labels[train_index], labels[test_index]
    features_train = features.iloc[train_index]
    features_test = features.iloc[test_index]
    labels_train = labels.iloc[train_index]
    labels_test = labels.iloc[test_index]
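    # keep the matching rows of the original dataframe to report file paths later;
    # this assumes the resampling step above preserved row alignment with df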
    path_test = df.iloc[test_index]

    # print the class distribution of the training split
    print(labels_train.reset_index(name='label').groupby(['label']).size())

    # train the classifier and predict on the held-out test set
    clf.fit(features_train, labels_train)
    labels_predicted = clf.predict(features_test)

    # dump the test file paths, true labels, and predicted labels for inspection
    pprint(path_test['fullpathname'].values.tolist())
    pprint(labels_test.tolist())
    pprint(labels_predicted.tolist())

    # the six stereotype labels, in the order used for the confusion matrix and metrics below
    stereotypes = ['Controller', 'Coordinator', 'Information Holder', 'Interfacer', 'Service Provider', 'Structurer']
    conf_matrix = confusion_matrix(labels_test, labels_predicted, labels=stereotypes)
    pprint(conf_matrix)

    p_score = precision_score(labels_test, labels_predicted, average='weighted', labels=stereotypes)
    r_score = recall_score(labels_test, labels_predicted, average='weighted', labels=stereotypes)
    f_score = f1_score(labels_test, labels_predicted, average='weighted', labels=stereotypes)
    mcc_score = matthews_corrcoef(labels_test, labels_predicted)
    # print precision, recall, F1 and MCC (tab-separated)
    print("%0.2f\t%0.2f\t%0.2f\t%0.2f" % (p_score, r_score, f_score, mcc_score))