Source code for evalne.evaluation.score

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file contains methods and classes that simplify the management and storage of evaluation results, both for
# individual methods and for complete evaluations.

import os
import pickle
import warnings
import numpy as np
import pandas as pd

from collections import Counter
from collections import OrderedDict
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression

from evalne.utils import viz_utils as viz


class Scoresheet:
    """
    Class that simplifies the logging and management of evaluation results and execution times. Functions for logging,
    plotting and writing the results to files are provided. The Scoresheet does not log the complete train or test
    model predictions.

    Parameters
    ----------
    tr_te : string, optional
        A string indicating if the 'train' or 'test' results should be stored. Default is 'test'.
    precatk_vals : list of int or None, optional
        The values for which the precision at k should be computed. Default is None.
    """

    def __init__(self, tr_te='test', precatk_vals=None):
        self._tr_te = tr_te
        self._precatk_vals = precatk_vals
        self._scoresheet = OrderedDict()
        self._all_methods = OrderedDict()
    def log_results(self, results):
        """
        Logs in the Scoresheet all the performance metrics (and execution time) extracted from the input Results
        object or list of Results objects. Multiple Results for the same method on the same network can be provided
        and will all be stored (these are assumed to correspond to different repetitions of the experiment).

        Parameters
        ----------
        results : Results or list of Results
            The Results object or objects to be logged in the Scoresheet.

        Examples
        --------
        Evaluate the common neighbours baseline and log the train and test results:

        >>> tr_scores = Scoresheet(tr_te='train')
        >>> te_scores = Scoresheet(tr_te='test')
        >>> result = nee.evaluate_baseline(method='common_neighbours')
        >>> tr_scores.log_results(result)
        >>> te_scores.log_results(result)

        """
        if isinstance(results, Results):
            self._log_result(results)
        else:
            for res in results:
                self._log_result(res)
    def _log_result(self, result):
        """
        Logs in the Scoresheet all the performance metrics (and execution time) extracted from the input Results
        object.

        Parameters
        ----------
        result : Results
            The Results object to be logged in the Scoresheet.
        """
        # Get the dictionary keys
        k1 = result.params['nw_name']   # First key is network name
        k2 = result.method              # Second key is method name
        self._all_methods[k2] = 0

        # Store the results
        if k1 in self._scoresheet:
            # Dataset exists in the dictionary, so we extend it
            if k2 in self._scoresheet[k1]:
                # Method exists in the dictionary, so we extend its metrics with vals of new exp repeat
                metrics, vals = result.get_all(self._tr_te, self._precatk_vals)
                for i in range(len(metrics)):
                    self._scoresheet[k1][k2][metrics[i]].append(np.around(vals[i], 4))
                self._scoresheet[k1][k2]['eval_time'].append(result.params['eval_time'])
                self._scoresheet[k1][k2]['edge_embed_method'].append(result.params.get('edge_embed_method', 'None'))
            else:
                # Method is not yet in the dict, so we add method and metrics
                metrics, vals = result.get_all(self._tr_te, self._precatk_vals)
                self._scoresheet[k1][k2] = OrderedDict(zip(metrics, map(lambda x: [np.around(x, 4)], vals)))
                self._scoresheet[k1][k2]['eval_time'] = [result.params['eval_time']]
                self._scoresheet[k1][k2]['edge_embed_method'] = [result.params.get('edge_embed_method', 'None')]
        else:
            # Dataset is not yet in the dict, so we add dataset, method and metrics
            metrics, vals = result.get_all(self._tr_te, self._precatk_vals)
            aux = OrderedDict(zip(metrics, map(lambda x: [np.around(x, 4)], vals)))
            self._scoresheet[k1] = OrderedDict({k2: aux})
            self._scoresheet[k1][k2]['eval_time'] = [result.params['eval_time']]
            self._scoresheet[k1][k2]['edge_embed_method'] = [result.params.get('edge_embed_method', 'None')]
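    # Illustrative note (not part of the original module): after logging, the internal
    # ``_scoresheet`` attribute is a nested OrderedDict keyed first by network name and
    # then by method name, with one list per metric holding one value per experiment
    # repeat. For a single logged repeat it looks roughly like:
    #
    #   OrderedDict({'Network_1': OrderedDict({'katz': OrderedDict({
    #       'tn': [684.0], 'fp': [0.0], ..., 'auroc': [0.8203],
    #       'eval_time': [0.035], 'edge_embed_method': ['None']})})})
    #
    # The metric names and example values above are taken from the docstrings in this
    # class and are only meant to show the shape of the structure.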
    def get_pandas_df(self, metric='auroc', repeat=None):
        """
        Returns a view of the Scoresheet as a pandas DataFrame for the specified metric. The columns of the DataFrame
        represent different networks and the rows different methods. If multiple Results for the same network/method
        combination were logged (multiple repetitions of the experiment), one can select any of these repeats or get
        the average over all.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.
        repeat : int, optional
            An int indicating the experiment repeat for which the results should be returned. If not indicated, the
            average over all repeats will be computed and returned. Default is None (computes average over repeats).

        Returns
        -------
        df : DataFrame
            A pandas DataFrame view of the Scoresheet for the specified metric.

        Raises
        ------
        ValueError
            If the requested metric does not exist.
            If the Scoresheet is empty, so a DataFrame cannot be generated.

        Notes
        -----
        For uncountable 'metrics' such as the node pair embedding operator (i.e. 'edge_embed_method'), the average
        returns the most frequent item in the vector.

        Examples
        --------
        Read a scoresheet and get the auroc scores as a pandas DataFrame:

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> df = scores.get_pandas_df()
        >>> df
                              Network_1  Network_2
        katz                     0.8203     0.8288
        common_neighbours        0.3787     0.3841
        jaccard_coefficient      0.3787     0.3841

        Read a scoresheet and get the f scores of the first repetition of the experiment:

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> df = scores.get_pandas_df('f_score', repeat=0)
        >>> df
                              Network_1  Network_2
        katz                          0          0
        common_neighbours        0.7272     0.7276
        jaccard_coefficient      0.7265     0.7268

        """
        if len(self._scoresheet) != 0:
            nw = next(iter(self._scoresheet))
            if metric not in iter(self._scoresheet[nw][next(iter(self._scoresheet[nw]))].keys()):
                raise ValueError('Requested metric `{}` does not exist!'.format(metric))
        else:
            raise ValueError('Scoresheet is empty, can not generate pandas df! Try logging some results first.')
        cols = self._scoresheet.keys()
        rows = list(self._all_methods)
        df = pd.DataFrame(index=rows, columns=cols)
        for k1 in cols:
            for k2 in rows:
                d = self._scoresheet[k1].get(k2)
                if d is not None:
                    if repeat is None:
                        if metric == 'edge_embed_method':
                            count = Counter(d.get(metric))
                            df[k1][k2] = count.most_common(1)[0][0]
                        else:
                            df[k1][k2] = np.around(np.mean(np.array(d.get(metric))), 4)
                    else:
                        arr = d.get(metric)
                        if len(arr) >= repeat + 1:
                            df[k1][k2] = d.get(metric)[repeat]
                        else:
                            df[k1][k2] = None
        return df
    def get_latex(self, metric='auroc'):
        """
        Returns a view of the Scoresheet as a Latex table for the specified metric. The columns of the table represent
        different networks and the rows different methods. If multiple Results for the same network/method combination
        were logged (multiple repetitions of the experiment), the average is returned.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.

        Returns
        -------
        latex_table : string
            A latex table as a string.
        """
        df = self.get_pandas_df(metric)
        return df.to_latex()
    def print_tabular(self, metric='auroc'):
        """
        Prints a tabular view of the Scoresheet for the specified metric. The columns of the table represent different
        networks and the rows different methods. If multiple Results for the same network/method combination were
        logged (multiple repetitions of the experiment), the average is shown.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.

        Examples
        --------
        Read a scoresheet and get the average execution times over all experiment repeats as tabular output:

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> scores.print_tabular('eval_time')
                              Network_1  Network_2
        katz                     0.0350     0.0355
        common_neighbours        0.0674     0.0676
        jaccard_coefficient      0.6185     0.6693

        """
        print(self.get_pandas_df(metric))
    def write_tabular(self, filename, metric='auroc'):
        """
        Writes a tabular view of the Scoresheet for the specified metric to a file. The columns of the table represent
        different networks and the rows different methods. If multiple Results for the same network/method combination
        were logged (multiple repetitions of the experiment), the average is used.

        Parameters
        ----------
        filename : string
            The file where the results should be stored.
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score' or 'eval_time'. Default is 'auroc'.
        """
        header = '\n\nEvaluation results ({}):\n-----------------------\n'.format(metric)
        f = open(filename, 'a')
        f.write(header)
        df = self.get_pandas_df(metric)
        df.to_csv(f, sep='\t', na_rep='NA')
        f.close()
    def write_all(self, filename, repeats='avg'):
        """
        Writes for all networks, methods and performance metrics the corresponding values to a file. If multiple
        Results for the same network/method combination were logged (multiple repetitions of the experiment), the
        method can return the average or all logged values.

        Parameters
        ----------
        filename : string
            The file where the results should be stored.
        repeats : string, optional
            Can be one of 'all', 'avg'. Default is 'avg'.

        Notes
        -----
        For uncountable 'metrics' such as the node pair embedding operator (i.e. 'edge_embed_method'), the average
        returns the most frequent item in the vector.

        Examples
        --------
        Read a scoresheet and write all metrics to a file with repeats='avg':

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> scores.write_all('./test.txt')
        >>> print(open('test.txt', 'rb').read())
        Network_1 Network
        ---------------------------
        katz:
         tn:     684.0
         fp:     0.0
         fn:     684.0
         tp:     0.0
         auroc:  0.8203
         ...

        Read a scoresheet and write all metrics to a file with repeats='all':

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> scores.write_all('./test.txt', 'all')
        >>> print(open('test.txt', 'rb').read())
        Network_1 Network
        ---------------------------
        katz:
         tn:     [684 684]
         fp:     [0 0]
         fn:     [684 684]
         tp:     [0 0]
         auroc:  [0.8155 0.8252]
         ...

        """
        f = open(filename, 'a+b')
        # Loop over all datasets
        for k1 in self._scoresheet:
            f.write(('\n\n{} Network'.format(k1)).encode())
            f.write('\n---------------------------'.encode())
            # Loop over all methods
            for k2 in self._scoresheet[k1]:
                f.write(('\n{}:'.format(k2)).encode())
                f.write('\n '.encode())
                # Loop over all metrics (auroc, pr, f_score...)
                for k3 in self._scoresheet[k1][k2]:
                    if repeats == 'avg':
                        # Compute average over all exp repeats for each metric
                        if k3 == 'edge_embed_method':
                            count = Counter(self._scoresheet[k1][k2][k3])
                            f.write((k3 + ': \t ' + count.most_common(1)[0][0] + '\n ').encode())
                        else:
                            avg = np.around(np.mean(np.array(self._scoresheet[k1][k2][k3])), 4)
                            f.write((k3 + ': \t ' + str(avg) + '\n ').encode())
                    else:
                        # Report all values for each exp repeat
                        if k3 == 'edge_embed_method':
                            vals = self._scoresheet[k1][k2][k3]
                            f.write((k3 + ': \t ' + str(vals) + '\n ').encode())
                        else:
                            vals = np.around(np.array(self._scoresheet[k1][k2][k3]), 4)
                            f.write((k3 + ': \t ' + str(vals) + '\n ').encode())

        # Close the file
        f.close()
    def write_pickle(self, filename):
        """
        Writes a pickle representation of this object to a file.

        Parameters
        ----------
        filename : string
            The file where the pickle representation should be stored.
        """
        pickle.dump(self, open(filename, "wb"))
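# Illustrative usage sketch (not part of the original module). It builds a Results object
# from synthetic, already-binary predictions, logs it into a Scoresheet and exports the
# scores. The network name, method name and prediction values below are made up for the
# example; the Results class is defined further down in this module and is resolved when
# the function is called.
def _example_scoresheet_usage():
    train_labels = np.array([1, 1, 0, 0])
    train_pred = np.array([1, 0, 0, 0])
    test_labels = np.array([1, 0, 1, 0])
    test_pred = np.array([1, 0, 0, 0])
    res = Results(method='toy_method',
                  params={'nw_name': 'Toy_Network', 'eval_time': 0.01},
                  train_pred=train_pred, train_labels=train_labels,
                  test_pred=test_pred, test_labels=test_labels)
    scores = Scoresheet(tr_te='test')
    scores.log_results(res)
    # A pandas view of the logged AUROC values and a pickle dump of the whole sheet.
    print(scores.get_pandas_df(metric='auroc'))
    scores.write_pickle('toy_eval.pkl')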
class Results(object):
    """
    Class that encapsulates the train and test predictions of one method on a specific network and set of parameters.
    The train and test predictions are stored as Scores objects. Functions for plotting, printing and saving to files
    the train and test scores are provided. Supports binary classification only.

    Parameters
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_pred : ndarray
        An array containing the train predictions.
    train_labels : ndarray
        An array containing the train labels.
    test_pred : ndarray, optional
        An array containing the test predictions. Default is None.
    test_labels : ndarray, optional
        An array containing the test labels. Default is None.
    label_binarizer : string or Sklearn binary classifier, optional
        If the predictions returned by the model are not binary, this parameter indicates how these binary predictions
        should be computed in order to be able to provide metrics such as the confusion matrix. Any Sklearn binary
        classifier can be used or the keyword 'median' which will use the prediction medians as binarization
        thresholds. Default is LogisticRegression(solver='liblinear').

    Attributes
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    binary_preds : bool
        A bool indicating if the train and test predictions are binary or not.
    train_scores : Scores
        A Scores object containing train scores.
    test_scores : Scores, optional
        A Scores object containing test scores. Default is None.
    label_binarizer : string or Sklearn binary classifier, optional
        If the predictions returned by the model are not binary, this parameter indicates how these binary predictions
        should be computed in order to be able to provide metrics such as the confusion matrix. By default, the method
        binarizes the predictions such that their accuracy is maximised. Any Sklearn binary classifier can be used or
        the keyword 'median' which will use the prediction medians as binarization thresholds.
        Default is LogisticRegression(solver='liblinear').

    Raises
    ------
    AttributeError
        If the label binarizer is set to an incorrect value.
    """

    def __init__(self, method, params, train_pred, train_labels, test_pred=None, test_labels=None,
                 label_binarizer=LogisticRegression(solver='liblinear')):
        self.params = params
        self.method = method
        self.label_binarizer = label_binarizer
        self.binary_preds = self._check_binary(train_pred, test_pred)
        self.train_scores = None
        self.test_scores = None
        self._init_scores(train_pred, train_labels, test_pred, test_labels)

    @staticmethod
    def _check_binary(train_pred, test_pred):
        """
        Method that checks if the train and test predictions are binary.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        test_pred : ndarray, optional
            An array containing the test predictions.

        Returns
        -------
        binary_preds : bool
            A bool indicating if the train and test predictions are binary or not.
        """
        if test_pred is None:
            if ((train_pred == 0) | (train_pred == 1)).all():
                return True
            else:
                return False
        else:
            if ((train_pred == 0) | (train_pred == 1)).all() and ((test_pred == 0) | (test_pred == 1)).all():
                return True
            else:
                return False

    def _init_scores(self, train_pred, train_labels, test_pred, test_labels):
        """
        Method that creates the train and test Scores objects.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        train_labels : ndarray
            An array containing the train labels.
        test_pred : ndarray, optional
            An array containing the test predictions.
        test_labels : ndarray, optional
            An array containing the test labels.
        """
        # Check if the predictions are binary or not
        if self.binary_preds:
            # Create the score objects
            self.train_scores = Scores(y_true=train_labels, y_pred=train_pred, y_bin=train_pred)
            if test_pred is not None:
                self.test_scores = Scores(y_true=test_labels, y_pred=test_pred, y_bin=test_pred)
        else:
            if self.label_binarizer == 'median':
                # Compute binarized predictions using the median
                th1 = np.median(train_pred)
                train_bin = np.where(train_pred >= th1, 1, 0)
                if test_pred is not None:
                    th2 = np.median(test_pred)
                    test_bin = np.where(test_pred >= th2, 1, 0)
            elif self.label_binarizer == 'prop':
                num_zeros = int(len(train_labels) - sum(train_labels))
                train_bin = np.ones(len(train_labels))
                argsrt = np.argsort(train_pred)
                train_bin[argsrt[:num_zeros]] = 0
                if test_pred is not None:
                    # To avoid label leakage we assume the test data has the same proportion of pos/neg elems as train
                    test_labels = np.array(test_labels)
                    num_zeros = int((num_zeros / len(train_labels)) * len(test_labels))
                    test_bin = np.ones(len(test_labels))
                    argsrt = np.argsort(test_pred)
                    test_bin[argsrt[:num_zeros]] = 0
            else:
                try:
                    # Compute the binarized predictions
                    self.label_binarizer.fit(train_pred.reshape(-1, 1), train_labels)
                    train_bin = self.label_binarizer.predict(train_pred.reshape(-1, 1))
                    if test_pred is not None:
                        test_bin = self.label_binarizer.predict(test_pred.reshape(-1, 1))
                except AttributeError:
                    print('The label_binarizer is set to an incorrect value! '
                          'Method predictions are not binary so a correct label_binarizer is required.')
                    raise

            # Create the score objects
            self.train_scores = Scores(y_true=train_labels, y_pred=train_pred, y_bin=train_bin)
            if test_pred is not None:
                self.test_scores = Scores(y_true=test_labels, y_pred=test_pred, y_bin=test_bin)
    def plot(self, filename=None, results='auto', curve='all'):
        """
        Plots PR or ROC curves of the train or test predictions. If a filename is provided, the method will store the
        plot in pdf format to a file named <filename>+'_PR.pdf' or <filename>+'_ROC.pdf'.

        Parameters
        ----------
        filename : string, optional
            A string indicating the path and name of the file where to store the plot. If None, the plots are only
            shown on screen. Default is None.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).
        curve : string, optional
            Can be one of 'all', 'pr' or 'roc'. Default is 'all' (generates both curves).

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        # Get the appropriate train or test scores
        if results == 'train':
            scores = self.train_scores
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                results = 'test'
                scores = self.test_scores
            else:
                results = 'train'
                scores = self.train_scores

        if curve == 'all' or curve == 'pr':
            precision, recall, _ = precision_recall_curve(scores.y_true, scores.y_pred)
            viz.plot_curve('{}_{}_PR.pdf'.format(filename, results), recall, precision, 'Recall', 'Precision',
                           '{} {} PR curve'.format(self.method, results))
        if curve == 'all' or curve == 'roc':
            tolerance = 0.25
            if np.sum(scores.y_true) < tolerance * len(scores.y_true) or \
                    np.sum(scores.y_true) > (1 - tolerance) * len(scores.y_true):
                warnings.warn('ROC curves are not recommended in the case of extreme class imbalance. '
                              'PR curves should be preferred.', Warning)
            fpr, tpr, thresholds = roc_curve(scores.y_true, scores.y_pred)
            viz.plot_curve('{}_{}_ROC.pdf'.format(filename, results), fpr, tpr, 'False positive rate',
                           'True positive rate', '{} {} ROC curve'.format(self.method, results))
    def save(self, filename, results='auto', precatk_vals=None):
        """
        Writes the method name, execution parameters, and all available performance metrics (for train or test
        predictions) to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where the output should be stored.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        f = open(filename, 'a+')
        f.write("Method: {}".format(self.method))
        f.write("\nParameters: ")
        for k, v in self.params.items():
            f.write(str(k) + ": " + str(v) + ", ")

        # Get the appropriate train or test scores
        if results == 'train':
            f.write("\nTrain scores: ")
        elif results == 'test':
            if self.test_scores is not None:
                f.write("\nTest scores: ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                f.write("\nTest scores: ")
            else:
                f.write("\nTrain scores: ")

        metric_names, metric_vals = self.get_all(results, precatk_vals)
        for i in range(len(metric_names)):
            f.write("\n {} = {}".format(metric_names[i], metric_vals[i]))
        f.write("\n\n")
        f.close()
    def pretty_print(self, results='auto', precatk_vals=None):
        """
        Prints to screen the method name, execution parameters, and all available performance metrics (for train or
        test predictions).

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        print("Method: {}".format(self.method))
        print("Parameters: ")
        print(self.params.items())

        # Get the appropriate train or test scores
        if results == 'train':
            print("Train scores: ")
        elif results == 'test':
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                print("Train scores: ")

        metric_names, metric_vals = self.get_all(results, precatk_vals)
        for i in range(len(metric_names)):
            print("{} = {}".format(metric_names[i], metric_vals[i]))
        print("")
    def get_all(self, results='auto', precatk_vals=None):
        """
        Returns the names of all performance metrics that can be computed from train or test predictions and their
        associated values. These metrics are: 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision',
        'precisionatk', 'recall', 'fallout', 'miss', 'accuracy' and 'f_score'.

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        # Get the appropriate train or test scores
        if results == 'train':
            scores = self.train_scores
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                scores = self.train_scores

        # Add the available scores
        metric_names = ['tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
                        'miss', 'accuracy', 'f_score']
        metric_vals = [scores.tn, scores.fp, scores.fn, scores.tp, scores.auroc(), scores.average_precision(),
                       scores.precision(), scores.recall(), scores.fallout(), scores.miss(), scores.accuracy(),
                       scores.f_score()]

        # Add precision at k values
        if precatk_vals is not None:
            for i in precatk_vals:
                metric_names.append('prec@{}'.format(i))
                metric_vals.append(scores.precisionatk(i))

        return metric_names, metric_vals
    def save_predictions(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and the train or test predictions and corresponding labels
        to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where the output should be stored.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.
        """
        f = open(filename, 'a+')
        f.write("Method: {}".format(self.method))
        f.write("\nParameters: ")
        for k, v in self.params.items():
            f.write(str(k) + ": " + str(v) + ", ")

        # Get the appropriate train or test predictions
        if results == 'train':
            scores = self.train_scores
            f.write("\nTrain predictions | Train labels ")
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
                f.write("\nTest predictions | Test labels ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                scores = self.test_scores
                f.write("\nTest predictions | Test labels ")
            else:
                scores = self.train_scores
                f.write("\nTrain predictions | Train labels ")

        for i in range(len(scores.y_true)):
            f.write("\n {} {}".format(scores.y_pred[i].item(), scores.y_true[i].item()))
        f.write("\n")
        f.close()
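# Illustrative usage sketch (not part of the original module). It shows how real-valued
# scores can be wrapped in a Results object with the 'median' label binarizer and how the
# computed metrics can be inspected. All values below are made up for the example.
def _example_results_usage():
    train_labels = np.array([1, 1, 0, 0, 1, 0])
    train_pred = np.array([0.9, 0.7, 0.4, 0.2, 0.6, 0.3])   # real-valued scores, not binary
    res = Results(method='toy_method',
                  params={'nw_name': 'Toy_Network', 'eval_time': 0.01},
                  train_pred=train_pred, train_labels=train_labels,
                  label_binarizer='median')
    # Only train predictions were given, so 'auto' falls back to the train scores.
    res.pretty_print(results='auto', precatk_vals=[2, 4])
    names, vals = res.get_all(results='train')
    print(dict(zip(names, vals)))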
class Scores(object):
    """
    Class that encapsulates train or test predictions and exposes methods to compute different performance metrics.
    Supports binary classification only.

    Parameters
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    y_bin : ndarray
        An array containing binarized predictions.

    Attributes
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    y_bin : ndarray
        An array containing binarized predictions.
    tn : float
        The number of true negatives in prediction.
    fp : float
        The number of false positives in prediction.
    fn : float
        The number of false negatives in prediction.
    tp : float
        The number of true positives in prediction.
    """

    def __init__(self, y_true, y_pred, y_bin):
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.y_bin = np.array(y_bin)
        self._sorted = sorted(zip(self.y_true, self.y_pred), key=lambda x: x[1], reverse=True)
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_true, self.y_bin).ravel()
    def precision(self):
        """
        Computes the precision in prediction.

        Returns
        -------
        precision : float
            The prediction precision score.
        """
        return self.tp / (self.tp + self.fp) if (self.tp + self.fp) != 0 else float('NaN')
    def precisionatk(self, k=100):
        """
        Computes the precision at k score.

        Parameters
        ----------
        k : int, optional
            The k value for which to compute the precision score. Default is 100.

        Returns
        -------
        precisionatk : float
            The prediction precision score for value k.
        """
        if k > len(self._sorted):
            MAX = len(self._sorted)
        else:
            MAX = k
        aux = list(zip(*self._sorted))[0]
        rel = sum(aux[:MAX])
        return (1.0 * rel) / k if k != 0 else float('NaN')
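    # Illustrative note (not part of the original module): for y_true = [1, 0, 1, 1] and
    # y_pred = [0.9, 0.8, 0.4, 0.2], the pairs sorted by decreasing score start with the
    # labels (1, 0), so precisionatk(k=2) counts 1 relevant item among the top 2 and
    # returns 0.5. Note that the denominator is always k, even when k exceeds the number
    # of available predictions.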
    def average_precision(self):
        """
        Computes the average precision score.

        Returns
        -------
        avgprec : float
            The average precision score.
        """
        return average_precision_score(self.y_true, self.y_pred)
    def recall(self):
        """
        Computes the recall in prediction.

        Returns
        -------
        recall : float
            The prediction recall score.
        """
        return self.tp / (self.tp + self.fn) if (self.tp + self.fn) != 0 else float('NaN')
    def fallout(self):
        """
        Computes the fallout in prediction.

        Returns
        -------
        fallout : float
            The prediction fallout score.
        """
        return self.fp / (self.fp + self.tn) if (self.fp + self.tn) != 0 else float('NaN')
    def miss(self):
        """
        Computes the miss in prediction.

        Returns
        -------
        miss : float
            The prediction miss score.
        """
        return self.fn / (self.fn + self.tn) if (self.fn + self.tn) != 0 else float('NaN')
    def accuracy(self):
        """
        Computes the accuracy score.

        Returns
        -------
        accuracy : float
            The prediction accuracy score.
        """
        return accuracy_score(self.y_true, self.y_bin)
    def f_score(self, beta=1):
        """
        Computes the F-score as the weighted harmonic mean of precision and recall.

        Parameters
        ----------
        beta : float, optional
            Allows assigning more weight to precision or recall. If beta > 1, recall is emphasized over precision.
            If beta < 1, precision is emphasized over recall.

        Returns
        -------
        f_score : float
            The prediction f_score.

        Notes
        -----
        The generalized form is used, where P and R represent precision and recall, respectively:

        .. math::

            F = (\\beta^2 + 1) \\cdot P \\cdot R / (\\beta^2 \\cdot P + R)

            F = (\\beta^2 + 1) \\cdot tp / ((\\beta^2 + 1) \\cdot tp + \\beta^2 \\cdot fn + fp)

        """
        beta2 = beta ** 2
        beta2_tp = (beta2 + 1) * self.tp
        den = (beta2_tp + beta2 * self.fn + self.fp)
        return beta2_tp / den if den != 0 else float('NaN')
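    # Illustrative note (not part of the original module): with tp = 3, fp = 1, fn = 2 and
    # beta = 1, precision = 3/4 = 0.75 and recall = 3/5 = 0.6, so both forms above give
    # F1 = 2 * 3 / (2 * 3 + 2 + 1) = 6/9 = 0.6667 = 2 * P * R / (P + R).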
    def auroc(self):
        """
        Computes the Area Under the Receiver Operating Characteristic Curve (ROC AUC).

        Returns
        -------
        auroc : float
            The prediction auroc score.

        Notes
        -----
        Throws a warning if class imbalance is detected.
        """
        tolerance = 0.1
        if np.sum(self.y_true) < tolerance * len(self.y_true) or \
                np.sum(self.y_true) > (1 - tolerance) * len(self.y_true):
            warnings.warn('AUROC is not recommended in the case of extreme class imbalance. ', Warning)
        return roc_auc_score(self.y_true, self.y_pred)
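# Illustrative usage sketch (not part of the original module). A Scores object can also be
# built directly from label, score and binarized-score arrays; all values below are made up.
def _example_scores_usage():
    y_true = np.array([1, 1, 0, 0, 1, 0])
    y_pred = np.array([0.9, 0.7, 0.4, 0.2, 0.6, 0.3])   # real-valued scores
    y_bin = (y_pred >= 0.5).astype(int)                  # simple fixed-threshold binarization
    sc = Scores(y_true=y_true, y_pred=y_pred, y_bin=y_bin)
    print(sc.auroc(), sc.average_precision(), sc.precision(), sc.recall(), sc.f_score(beta=1))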
class NCResults(object):
    """
    Class that encapsulates the train and test predictions of one method on a specific network and set of parameters.
    The train and test predictions are stored as NCScores objects. Functions for plotting, printing and saving to
    files the train and test scores are provided. Supports multi-label classification.

    Parameters
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_pred : ndarray
        An array containing the train predictions.
    train_labels : ndarray
        An array containing the train labels.
    test_pred : ndarray, optional
        An array containing the test predictions. Default is None.
    test_labels : ndarray, optional
        An array containing the test labels. Default is None.

    Attributes
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_scores : NCScores
        An NCScores object containing train scores.
    test_scores : NCScores, optional
        An NCScores object containing test scores. Default is None.
    """

    def __init__(self, method, params, train_pred, train_labels, test_pred=None, test_labels=None):
        self.params = params
        self.method = method
        self.train_scores = None
        self.test_scores = None
        self._init_scores(train_pred, train_labels, test_pred, test_labels)

    def _init_scores(self, train_pred, train_labels, test_pred, test_labels):
        """
        Method that creates the train and test NCScores objects.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        train_labels : ndarray
            An array containing the train labels.
        test_pred : ndarray, optional
            An array containing the test predictions.
        test_labels : ndarray, optional
            An array containing the test labels.
        """
        # Create the NCScores
        self.train_scores = NCScores(y_true=train_labels, y_pred=train_pred)
        if test_pred is not None:
            self.test_scores = NCScores(y_true=test_labels, y_pred=test_pred)
    def save(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and all available performance metrics (for train or test
        predictions) to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where the output should be stored.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        f = open(filename, 'a+')
        f.write("Method: {}".format(self.method))
        f.write("\nParameters: ")
        for k, v in self.params.items():
            f.write(str(k) + ": " + str(v) + ", ")

        # Get the appropriate train or test scores
        if results == 'train':
            f.write("\nTrain scores: ")
        elif results == 'test':
            if self.test_scores is not None:
                f.write("\nTest scores: ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                f.write("\nTest scores: ")
            else:
                f.write("\nTrain scores: ")

        metric_names, metric_vals = self.get_all(results)
        for i in range(len(metric_names)):
            f.write("\n {} = {}".format(metric_names[i], metric_vals[i]))
        f.write("\n\n")
        f.close()
    def pretty_print(self, results='auto'):
        """
        Prints to screen the method name, execution parameters, and all available performance metrics (for train or
        test predictions).

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        print("Method: {}".format(self.method))
        print("Parameters: ")
        print(self.params.items())

        # Get the appropriate train or test scores
        if results == 'train':
            print("Train scores: ")
        elif results == 'test':
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                print("Train scores: ")

        metric_names, metric_vals = self.get_all(results)
        for i in range(len(metric_names)):
            print("{} = {}".format(metric_names[i], metric_vals[i]))
        print("")
    def get_all(self, results='auto', precatk_vals=None):
        """
        Returns the names of all performance metrics that can be computed from train or test predictions and their
        associated values. These metrics are: 'f1_micro', 'f1_macro', 'f1_weighted'.

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).
        precatk_vals : None, optional
            Not used.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        # Get the appropriate train or test scores
        if results == 'train':
            scores = self.train_scores
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                scores = self.train_scores

        # Add the available scores
        metric_names = ['f1_micro', 'f1_macro', 'f1_weighted']
        metric_vals = [scores.f1_micro(), scores.f1_macro(), scores.f1_weighted()]

        return metric_names, metric_vals
    def save_predictions(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and the train or test predictions to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where the output should be stored.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects 'test'
            if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.
        """
        f = open(filename, 'a+')
        f.write("Method: {}".format(self.method))
        f.write("\nParameters: ")
        for k, v in self.params.items():
            f.write(str(k) + ": " + str(v) + ", ")

        # Get the appropriate train or test predictions
        if results == 'train':
            scores = self.train_scores
            f.write("\nTrain predictions | Train labels ")
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
                f.write("\nTest predictions | Test labels ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                scores = self.test_scores
                f.write("\nTest predictions | Test labels ")
            else:
                scores = self.train_scores
                f.write("\nTrain predictions | Train labels ")

        for i in range(len(scores.y_true)):
            f.write("\n {} {}".format(scores.y_pred[i].item(), scores.y_true[i].item()))
        f.write("\n")
        f.close()
class NCScores(object):
    """
    Class that encapsulates train or test predictions and exposes methods to compute different performance metrics.
    Supports multi-label classification.

    Parameters
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.

    Attributes
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self._sorted = sorted(zip(self.y_true, self.y_pred), key=lambda x: x[1], reverse=True)
    def f1_micro(self):
        """
        Computes the f1 score globally for all labels (i.e. sums the tp for all classes and divides by the sum of
        all tp+fp).

        Returns
        -------
        f1_micro : float
            The f1 micro score.
        """
        return f1_score(self.y_true, self.y_pred, average='micro')
    def f1_macro(self):
        """
        Computes the f1 score for each label, and finds their unweighted average. This metric does not take label
        imbalance into account.

        Returns
        -------
        f1_macro : float
            The f1 macro score.
        """
        return f1_score(self.y_true, self.y_pred, average='macro')
    def f1_weighted(self):
        """
        Computes the f1 score for each label, and finds their average, weighted by support (the number of true
        instances for each label).

        Returns
        -------
        f1_weighted : float
            The weighted f1 score.
        """
        return f1_score(self.y_true, self.y_pred, average='weighted')
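# Illustrative usage sketch (not part of the original module). It wraps multi-class label
# predictions in an NCResults object and prints the micro/macro/weighted f1 scores. The
# label values and parameters below are made up for the example.
def _example_nc_usage():
    train_labels = np.array([0, 1, 2, 1, 0, 2])
    train_pred = np.array([0, 1, 2, 1, 2, 2])   # predicted class per node
    res = NCResults(method='toy_nc_method',
                    params={'nw_name': 'Toy_Network', 'eval_time': 0.01},
                    train_pred=train_pred, train_labels=train_labels)
    names, vals = res.get_all(results='train')
    for name, val in zip(names, vals):
        print('{} = {:.4f}'.format(name, val))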