#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018
# This file contains methods and classes that simplify the management and storage of evaluation results, both for
# individual methods as well as complete evaluations.
import os
import pickle
import warnings
import numpy as np
import pandas as pd
from collections import Counter
from collections import OrderedDict
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.linear_model import LogisticRegression
from evalne.utils import viz_utils as viz
class Scoresheet:
    """
    Class that simplifies the logging and management of evaluation results and execution times. Functions for logging,
    plotting and writing the results to files are provided. The Scoresheet does not log the complete train or test
    model predictions.

    Parameters
    ----------
    tr_te : string, optional
        A string indicating if the 'train' or 'test' results should be stored. Default is 'test'.
    precatk_vals : list of int or None, optional
        The values for which the precision at k should be computed. Default is None.
    """

    def __init__(self, tr_te='test', precatk_vals=None):
        self._tr_te = tr_te
        self._precatk_vals = precatk_vals
        # Nested mapping: {network_name: {method_name: {metric: [one value per repeat]}}}
        self._scoresheet = OrderedDict()
        # Insertion-ordered set of every method ever logged (values are dummies).
        self._all_methods = OrderedDict()

    def log_results(self, results):
        """
        Logs in the Scoresheet all the performance metrics (and execution time) extracted from the input Results object
        or list of Results objects. Multiple Results for the same method on the same network can be provided and will
        all be stored (these are assumed to correspond to different repetitions of the experiment).

        Parameters
        ----------
        results : Results or list of Results
            The Results object or objects to be logged in the Scoresheet.

        Examples
        --------
        Evaluate the common neighbours baseline and log the train and test results:

        >>> tr_scores = Scoresheet(tr_te='train')
        >>> te_scores = Scoresheet(tr_te='test')
        >>> result = nee.evaluate_baseline(method='common_neighbours')
        >>> tr_scores.log_results(result)
        >>> te_scores.log_results(result)
        """
        if isinstance(results, Results):
            self._log_result(results)
        else:
            for res in results:
                self._log_result(res)

    def _log_result(self, result):
        """
        Logs in the Scoresheet all the performance metrics (and execution time) extracted from the input Results object.

        Parameters
        ----------
        result : Results
            The Results object to be logged in the Scoresheet.
        """
        k1 = result.params['nw_name']   # First key is network name
        k2 = result.method              # Second key is method name
        self._all_methods[k2] = 0
        metrics, vals = result.get_all(self._tr_te, self._precatk_vals)
        if k1 in self._scoresheet and k2 in self._scoresheet[k1]:
            # Network and method already logged: extend each metric with the new repeat's value.
            entry = self._scoresheet[k1][k2]
            for metric, val in zip(metrics, vals):
                entry[metric].append(np.around(val, 4))
            entry['eval_time'].append(result.params['eval_time'])
            entry['edge_embed_method'].append(result.params.get('edge_embed_method', 'None'))
        else:
            # New network and/or method: create the metric lists from scratch.
            entry = OrderedDict(zip(metrics, ([np.around(v, 4)] for v in vals)))
            entry['eval_time'] = [result.params['eval_time']]
            entry['edge_embed_method'] = [result.params.get('edge_embed_method', 'None')]
            self._scoresheet.setdefault(k1, OrderedDict())[k2] = entry

    def get_pandas_df(self, metric='auroc', repeat=None):
        """
        Returns a view of the Scoresheet as a pandas DataFrame for the specified metric. The columns of the DataFrame
        represent different networks and the rows different methods. If multiple Results for the same network/method
        combination were logged (multiple repetitions of the experiment), one can select any of these repeats or get
        the average over all.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.
        repeat : int, optional
            An int indicating the experiment repeat for which the results should be returned. If not indicated, the
            average over all repeats will be computed and returned. Default is None (computes average over repeats).

        Returns
        -------
        df : DataFrame
            A pandas DataFrame view of the Scoresheet for the specified metric.

        Raises
        ------
        ValueError
            If the requested metric does not exist.
            If the Scoresheet is empty so a DataFrame can not be generated.

        Notes
        -----
        For uncountable 'metrics' such as the node pair embedding operator (i.e 'edge_embed_method'), avg returns the
        most frequent item in the vector.

        Examples
        --------
        Read a scoresheet and get the auroc scores as a pandas DataFrame

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> df = scores.get_pandas_df()
        >>> df
                              Network_1  Network_2
        katz                     0.8203     0.8288
        common_neighbours        0.3787     0.3841
        jaccard_coefficient      0.3787     0.3841
        """
        if len(self._scoresheet) == 0:
            raise ValueError('Scoresheet is empty, can not generate pandas df! Try logging some results first.')
        # Validate the metric against the first logged network/method entry.
        nw = next(iter(self._scoresheet))
        first_method = next(iter(self._scoresheet[nw]))
        if metric not in self._scoresheet[nw][first_method]:
            raise ValueError('Requested metric `{}` does not exist!'.format(metric))
        cols = self._scoresheet.keys()
        rows = list(self._all_methods)
        df = pd.DataFrame(index=rows, columns=cols)
        for k1 in cols:
            for k2 in rows:
                d = self._scoresheet[k1].get(k2)
                if d is None:
                    # Method was never evaluated on this network; cell stays NaN.
                    continue
                # NOTE: use df.loc rather than chained `df[k1][k2] = ...` indexing, which is
                # unreliable (writes to a temporary under pandas copy-on-write semantics).
                if repeat is None:
                    if metric == 'edge_embed_method':
                        # Uncountable metric: report the most frequent operator across repeats.
                        df.loc[k2, k1] = Counter(d.get(metric)).most_common(1)[0][0]
                    else:
                        df.loc[k2, k1] = np.around(np.mean(np.array(d.get(metric))), 4)
                else:
                    arr = d.get(metric)
                    df.loc[k2, k1] = arr[repeat] if len(arr) >= repeat + 1 else None
        return df

    def get_latex(self, metric='auroc'):
        """
        Returns a view of the Scoresheet as a Latex table for the specified metric. The columns of the table
        represent different networks and the rows different methods. If multiple Results for the same network/method
        combination were logged (multiple repetitions of the experiment), the average is returned.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.

        Returns
        -------
        latex_table : string
            A latex table as a string.
        """
        df = self.get_pandas_df(metric)
        return df.to_latex()

    def print_tabular(self, metric='auroc'):
        """
        Prints a tabular view of the Scoresheet for the specified metric. The columns of the table represent different
        networks and the rows different methods. If multiple Results for the same network/method combination were logged
        (multiple repetitions of the experiment), the average is showed.

        Parameters
        ----------
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score', 'eval_time' or 'edge_embed_method'. Default is 'auroc'.
        """
        print(self.get_pandas_df(metric))

    def write_tabular(self, filename, metric='auroc'):
        """
        Writes a tabular view of the Scoresheet for the specified metric to a file. The columns of the table represent
        different networks and the rows different methods. If multiple Results for the same network/method combination
        were logged (multiple repetitions of the experiment), the average is used.

        Parameters
        ----------
        filename : string
            A file where to store the results.
        metric : string, optional
            Can be one of 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall', 'fallout',
            'miss', 'accuracy', 'f_score' or 'eval_time'. Default is 'auroc'.
        """
        header = '\n\nEvaluation results ({}):\n-----------------------\n'.format(metric)
        df = self.get_pandas_df(metric)
        # Append mode so several metrics can be written to the same report file.
        with open(filename, 'a') as f:
            f.write(header)
            df.to_csv(f, sep='\t', na_rep='NA')

    def write_all(self, filename, repeats='avg'):
        """
        Writes for all networks, methods and performance metrics the corresponding values to a file. If multiple Results
        for the same network/method combination were logged (multiple repetitions of the experiment), the method can
        return the average or all logged values.

        Parameters
        ----------
        filename : string
            A file where to store the results.
        repeats : string, optional
            Can be one of 'all', 'avg'. Default is 'avg'.

        Notes
        -----
        For uncountable 'metrics' such as the node pair embedding operator (i.e 'edge_embed_method'), avg returns the
        most frequent item in the vector.

        Examples
        --------
        Read a scoresheet and write all metrics to a file with repeats='avg':

        >>> scores = pickle.load(open('lp_eval_1207_1638/eval.pkl', 'rb'))
        >>> scores.write_all('./test.txt')
        """
        # Binary append mode matches the original on-disk format (encoded writes).
        with open(filename, 'a+b') as f:
            # Loop over all datasets
            for k1 in self._scoresheet:
                f.write(('\n\n{} Network'.format(k1)).encode())
                f.write('\n---------------------------'.encode())
                # Loop over all methods
                for k2 in self._scoresheet[k1]:
                    f.write(('\n{}:'.format(k2)).encode())
                    f.write('\n '.encode())
                    # Loop over all metrics (auroc, pr, f_score...)
                    for k3 in self._scoresheet[k1][k2]:
                        if repeats == 'avg':
                            # Compute average over all exp repeats for each metric
                            if k3 == 'edge_embed_method':
                                count = Counter(self._scoresheet[k1][k2][k3])
                                f.write((k3 + ': \t ' + count.most_common(1)[0][0] + '\n ').encode())
                            else:
                                avg = np.around(np.mean(np.array(self._scoresheet[k1][k2][k3])), 4)
                                f.write((k3 + ': \t ' + str(avg) + '\n ').encode())
                        else:
                            # Report all values for each exp repeat
                            if k3 == 'edge_embed_method':
                                vals = self._scoresheet[k1][k2][k3]
                                f.write((k3 + ': \t ' + str(vals) + '\n ').encode())
                            else:
                                vals = np.around(np.array(self._scoresheet[k1][k2][k3]), 4)
                                f.write((k3 + ': \t ' + str(vals) + '\n ').encode())

    def write_pickle(self, filename):
        """
        Writes a pickle representation of this object to a file.

        Parameters
        ----------
        filename : string
            A file where to store the pickle representation.
        """
        with open(filename, "wb") as f:
            pickle.dump(self, f)
class Results(object):
    """
    Class that encapsulates the train and test predictions of one method on a specific network and set of parameters.
    The train and test predictions are stored as Scores objects. Functions for plotting, printing and saving to files
    the train and test scores are provided. Supports binary classification only.

    Parameters
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_pred : ndarray
        An array containing the train predictions.
    train_labels : ndarray
        An array containing the train labels.
    test_pred : ndarray, optional
        An array containing the test predictions. Default is None.
    test_labels : ndarray, optional
        An array containing the test labels. Default is None.
    label_binarizer : string or Sklearn binary classifier, optional
        If the predictions returned by the model are not binary, this parameter indicates how these binary
        predictions should be computed in order to be able to provide metrics such as the confusion matrix.
        Any Sklearn binary classifier can be used or the keyword 'median' which will use the prediction medians
        as binarization thresholds. Default is LogisticRegression(solver='liblinear').

    Attributes
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    binary_preds : bool
        A bool indicating if the train and test predictions are binary or not.
    train_scores : Scores
        A Scores object containing train scores.
    test_scores : Scores, optional
        A Scores object containing test scores. Default is None.
    label_binarizer : string or Sklearn binary classifier, optional
        If the predictions returned by the model are not binary, this parameter indicates how these binary
        predictions should be computed in order to be able to provide metrics such as the confusion matrix.
        Any Sklearn binary classifier can be used or the keyword 'median' which will use the prediction medians
        as binarization thresholds. Default is LogisticRegression(solver='liblinear').

    Raises
    ------
    AttributeError
        If the label binarizer is set to an incorrect value.
    """

    def __init__(self, method, params, train_pred, train_labels, test_pred=None, test_labels=None,
                 label_binarizer=None):
        # NOTE: `None` is used as a sentinel instead of a LogisticRegression default in the
        # signature; a default-argument estimator would be a single mutable instance created at
        # class-definition time and shared (and re-fit) across every Results object.
        if label_binarizer is None:
            label_binarizer = LogisticRegression(solver='liblinear')
        self.params = params
        self.method = method
        self.label_binarizer = label_binarizer
        self.binary_preds = self._check_binary(train_pred, test_pred)
        self.train_scores = None
        self.test_scores = None
        self._init_scores(train_pred, train_labels, test_pred, test_labels)

    @staticmethod
    def _check_binary(train_pred, test_pred):
        """
        Method that checks if the train and test predictions are binary.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        test_pred : ndarray, optional
            An array containing the test predictions.

        Returns
        -------
        binary_preds : bool
            A bool indicating if the train and test predictions are binary or not.
        """
        is_bin = ((train_pred == 0) | (train_pred == 1)).all()
        if test_pred is not None:
            is_bin = is_bin and ((test_pred == 0) | (test_pred == 1)).all()
        return bool(is_bin)

    def _init_scores(self, train_pred, train_labels, test_pred, test_labels):
        """
        Method that creates the train and test Scores objects.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        train_labels : ndarray
            An array containing the train labels.
        test_pred : ndarray, optional
            An array containing the test predictions.
        test_labels : ndarray, optional
            An array containing the test labels.
        """
        if self.binary_preds:
            # Predictions are already binary; use them directly as the binarized predictions.
            self.train_scores = Scores(y_true=train_labels, y_pred=train_pred, y_bin=train_pred)
            if test_pred is not None:
                self.test_scores = Scores(y_true=test_labels, y_pred=test_pred, y_bin=test_pred)
        else:
            if self.label_binarizer == 'median':
                # Binarize using the median of each prediction vector as the threshold.
                th1 = np.median(train_pred)
                train_bin = np.where(train_pred >= th1, 1, 0)
                if test_pred is not None:
                    th2 = np.median(test_pred)
                    test_bin = np.where(test_pred >= th2, 1, 0)
            elif self.label_binarizer == 'prop':
                # Binarize so the output has the same proportion of 0s/1s as the train labels:
                # the `num_zeros` lowest-scored predictions become 0, the rest 1.
                num_zeros = int(len(train_labels) - sum(train_labels))
                train_bin = np.ones(len(train_labels))
                argsrt = np.argsort(train_pred)
                train_bin[argsrt[:num_zeros]] = 0
                if test_pred is not None:
                    # To avoid label leakage we assume the test data has the same proportion of pos/neg elems as train
                    test_labels = np.array(test_labels)
                    num_zeros = int((num_zeros / len(train_labels)) * len(test_labels))
                    test_bin = np.ones(len(test_labels))
                    argsrt = np.argsort(test_pred)
                    test_bin[argsrt[:num_zeros]] = 0
            else:
                try:
                    # Treat label_binarizer as an Sklearn classifier: fit on train, predict binary labels.
                    self.label_binarizer.fit(train_pred.reshape(-1, 1), train_labels)
                    train_bin = self.label_binarizer.predict(train_pred.reshape(-1, 1))
                    if test_pred is not None:
                        test_bin = self.label_binarizer.predict(test_pred.reshape(-1, 1))
                except AttributeError:
                    print('The label_binarizer is set to an incorrect value! '
                          'Method predictions are not binary so a correct label_binarizer is required.')
                    raise
            # Create the score objects
            self.train_scores = Scores(y_true=train_labels, y_pred=train_pred, y_bin=train_bin)
            if test_pred is not None:
                self.test_scores = Scores(y_true=test_labels, y_pred=test_pred, y_bin=test_bin)

    def plot(self, filename=None, results='auto', curve='all'):
        """
        Plots PR or ROC curves of the train or test predictions. If a filename is provided, the method will store the
        plot in pdf format to a file named <filename>+'_PR.pdf' or <filename>+'_ROC.pdf'.

        Parameters
        ----------
        filename : string, optional
            A string indicating the path and name of the file where to store the plot. If None, the plots are only
            shown on screen. Default is None.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).
        curve : string, optional
            Can be one of 'all', 'pr' or 'roc'. Default is 'all' (generates both curves).

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        # Get the appropriate train or test scores
        if results == 'train':
            scores = self.train_scores
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                results = 'test'
                scores = self.test_scores
            else:
                results = 'train'
                scores = self.train_scores
        if curve == 'all' or curve == 'pr':
            precision, recall, _ = precision_recall_curve(scores.y_true, scores.y_pred)
            viz.plot_curve('{}_{}_PR.pdf'.format(filename, results), recall, precision, 'Recall', 'Precision',
                           '{} {} PR curve'.format(self.method, results))
        if curve == 'all' or curve == 'roc':
            # Warn when the class distribution is too skewed for ROC to be meaningful.
            tolerance = 0.25
            if np.sum(scores.y_true) < tolerance * len(scores.y_true) or \
                    np.sum(scores.y_true) > (1 - tolerance) * len(scores.y_true):
                warnings.warn('ROC curves are not recommended in the case of extreme class imbalance. '
                              'PR curves should be preferred.', Warning)
            fpr, tpr, thresholds = roc_curve(scores.y_true, scores.y_pred)
            viz.plot_curve('{}_{}_ROC.pdf'.format(filename, results), fpr, tpr, 'False positive rate',
                           'True positive rate', '{} {} ROC curve'.format(self.method, results))

    def save(self, filename, results='auto', precatk_vals=None):
        """
        Writes the method name, execution parameters, and all available performance metrics (for train or test
        predictions) to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where to store the output.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        # `with` guarantees the file is closed even if the ValueError below is raised
        # (the previous implementation leaked the handle on that path).
        with open(filename, 'a+') as f:
            f.write("Method: {}".format(self.method))
            f.write("\nParameters: ")
            for k, v in self.params.items():
                f.write(str(k) + ": " + str(v) + ", ")
            # Get the appropriate train or test scores
            if results == 'train':
                f.write("\nTrain scores: ")
            elif results == 'test':
                if self.test_scores is not None:
                    f.write("\nTest scores: ")
                else:
                    raise ValueError('Test scores not initialized!')
            else:
                if self.test_scores is not None:
                    f.write("\nTest scores: ")
                else:
                    f.write("\nTrain scores: ")
            metric_names, metric_vals = self.get_all(results, precatk_vals)
            for i in range(len(metric_names)):
                f.write("\n {} = {}".format(metric_names[i], metric_vals[i]))
            f.write("\n\n")

    def pretty_print(self, results='auto', precatk_vals=None):
        """
        Prints to screen the method name, execution parameters, and all available performance metrics (for train or test
        predictions).

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        print("Method: {}".format(self.method))
        print("Parameters: ")
        print(self.params.items())
        # Get the appropriate train or test scores
        if results == 'train':
            print("Train scores: ")
        elif results == 'test':
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                print("Test scores: ")
            else:
                print("Train scores: ")
        metric_names, metric_vals = self.get_all(results, precatk_vals)
        for i in range(len(metric_names)):
            print("{} = {}".format(metric_names[i], metric_vals[i]))
        print("")

    def get_all(self, results='auto', precatk_vals=None):
        """
        Returns the names of all performance metrics that can be computed from train or test predictions and their
        associated values. These metrics are: 'tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision',
        'precisionatk', 'recall', 'fallout', 'miss', 'accuracy' and 'f_score'.

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).
        precatk_vals : list of int or None, optional
            The values for which the precision at k should be computed. Default is None.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        # Get the appropriate train or test scores
        if results == 'train':
            scores = self.train_scores
        elif results == 'test':
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                raise ValueError('Test scores not initialized!')
        else:
            if self.test_scores is not None:
                scores = self.test_scores
            else:
                scores = self.train_scores
        # Add the available scores
        metric_names = ['tn', 'fp', 'fn', 'tp', 'auroc', 'average_precision', 'precision', 'recall',
                        'fallout', 'miss', 'accuracy', 'f_score']
        metric_vals = [scores.tn, scores.fp, scores.fn, scores.tp, scores.auroc(), scores.average_precision(),
                       scores.precision(), scores.recall(), scores.fallout(), scores.miss(), scores.accuracy(),
                       scores.f_score()]
        # Add precision at k values
        if precatk_vals is not None:
            for i in precatk_vals:
                metric_names.append('prec@{}'.format(i))
                metric_vals.append(scores.precisionatk(i))
        return metric_names, metric_vals

    def save_predictions(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and the train or test predictions and corresponding labels to a
        file.

        Parameters
        ----------
        filename : string or file
            A file or filename where to store the output.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.
        """
        # `with` guarantees the file is closed even if the ValueError below is raised.
        with open(filename, 'a+') as f:
            f.write("Method: {}".format(self.method))
            f.write("\nParameters: ")
            for k, v in self.params.items():
                f.write(str(k) + ": " + str(v) + ", ")
            # Get the appropriate train or test predictions
            if results == 'train':
                scores = self.train_scores
                f.write("\nTrain predictions | Train labels ")
            elif results == 'test':
                if self.test_scores is not None:
                    scores = self.test_scores
                    f.write("\nTest predictions | Test labels ")
                else:
                    raise ValueError('Test scores not initialized!')
            else:
                if self.test_scores is not None:
                    scores = self.test_scores
                    f.write("\nTest predictions | Test labels ")
                else:
                    scores = self.train_scores
                    f.write("\nTrain predictions | Train labels ")
            for i in range(len(scores.y_true)):
                f.write("\n {} {}".format(scores.y_pred[i].item(), scores.y_true[i].item()))
            f.write("\n")
class Scores(object):
    """
    Class that encapsulates train or test predictions and exposes methods to compute different performance metrics.
    Supports binary classification only.

    Parameters
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    y_bin : ndarray
        An array containing binarized predictions.

    Attributes
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    y_bin : ndarray
        An array containing binarized predictions.
    tn : float
        The number of true negative in prediction.
    fp : float
        The number of false positives in prediction.
    fn : float
        The number of false negatives in prediction.
    tp : float
        The number of true positives in prediction.
    """

    def __init__(self, y_true, y_pred, y_bin):
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        self.y_bin = np.array(y_bin)
        # (label, prediction) pairs sorted by descending prediction score; used by precisionatk().
        self._sorted = sorted(zip(self.y_true, self.y_pred), key=lambda x: x[1], reverse=True)
        # Fix: pin labels=[0, 1] so that degenerate inputs containing a single class still
        # produce a full 2x2 confusion matrix. Without it, confusion_matrix returns a 1x1
        # matrix in that case and the 4-way unpacking below raises a ValueError.
        self.tn, self.fp, self.fn, self.tp = confusion_matrix(self.y_true, self.y_bin, labels=[0, 1]).ravel()

    def precision(self):
        """
        Computes the precision in prediction.

        Returns
        -------
        precision : float
            The prediction precision score. NaN if no positives were predicted.
        """
        return self.tp / (self.tp + self.fp) if (self.tp + self.fp) != 0 else float('NaN')

    def precisionatk(self, k=100):
        """
        Computes the precision at k score.

        Parameters
        ----------
        k : int, optional
            The k value for which to compute the precision score. Default is 100.

        Returns
        -------
        precisionatk : float
            The prediction precision score for value k. NaN if k is 0.
        """
        # Only the top min(k, n) predictions can contribute relevant items, but the
        # denominator remains k, matching the original definition used by this class.
        top = min(k, len(self._sorted))
        if self._sorted:
            labels = list(zip(*self._sorted))[0]
            rel = sum(labels[:top])
        else:
            # Guard: zip(*[]) yields nothing, so indexing [0] would fail on empty input.
            rel = 0
        return (1.0 * rel) / k if k != 0 else float('NaN')

    def average_precision(self):
        """
        Computes the average precision score.

        Returns
        -------
        avgprec : float
            The average precision score.
        """
        return average_precision_score(self.y_true, self.y_pred)

    def recall(self):
        """
        Computes the recall in prediction.

        Returns
        -------
        recall : float
            The prediction recall score. NaN if there are no positive labels.
        """
        return self.tp / (self.tp + self.fn) if (self.tp + self.fn) != 0 else float('NaN')

    def fallout(self):
        """
        Computes the fallout in prediction.

        Returns
        -------
        fallout : float
            The prediction fallout score. NaN if there are no negative labels.
        """
        return self.fp / (self.fp + self.tn) if (self.fp + self.tn) != 0 else float('NaN')

    def miss(self):
        """
        Computes the miss in prediction.

        Returns
        -------
        miss : float
            The prediction miss score. NaN if nothing was predicted negative.
        """
        return self.fn / (self.fn + self.tn) if (self.fn + self.tn) != 0 else float('NaN')

    def accuracy(self):
        """
        Computes the accuracy score.

        Returns
        -------
        accuracy : float
            The prediction accuracy score.
        """
        return accuracy_score(self.y_true, self.y_bin)

    def f_score(self, beta=1):
        """
        Computes the F-score as the weighted harmonic mean of precision and recall.

        Parameters
        ----------
        beta : float, optional
            Allows to assign more weight to precision or recall.
            If beta > 1, recall is emphasized over precision.
            If beta < 1, precision is emphasized over recall.

        Returns
        -------
        f_score : float
            The prediction f_score. NaN if the denominator is 0.

        Notes
        -----
        The generalized form is used, where P and R represent precision and recall, respectively:

        .. math::

            F = (\\beta^2 + 1) \\cdot P \\cdot R / (\\beta^2 \\cdot P + R)

            F = (\\beta^2 + 1) \\cdot tp / ((\\beta^2 + 1) \\cdot tp + \\beta^2 \\cdot fn + fp)
        """
        beta2 = beta ** 2
        beta2_tp = (beta2 + 1) * self.tp
        den = (beta2_tp + beta2 * self.fn + self.fp)
        return beta2_tp / den if den != 0 else float('NaN')

    def auroc(self):
        """
        Computes the Area Under the Receiver Operating Characteristic Curve (ROC AUC).

        Returns
        -------
        auroc : float
            The prediction auroc score.

        Notes
        -----
        Throws a warning if class imbalance is detected.
        """
        tolerance = 0.1
        if np.sum(self.y_true) < tolerance * len(self.y_true) or \
                np.sum(self.y_true) > (1 - tolerance) * len(self.y_true):
            warnings.warn('AUROC is not recommended in the case of extreme class imbalance. ', Warning)
        return roc_auc_score(self.y_true, self.y_pred)
class NCResults(object):
    """
    Class that encapsulates the train and test predictions of one method on a specific network and set of parameters.
    The train and test predictions are stored as NCScores objects. Functions for plotting, printing and saving to files
    the train and test scores are provided. Supports multi-label classification.

    Parameters
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_pred : ndarray
        An array containing the train predictions.
    train_labels : ndarray
        An array containing the train labels.
    test_pred : ndarray, optional
        An array containing the test predictions. Default is None.
    test_labels : ndarray, optional
        An array containing the test labels. Default is None.

    Attributes
    ----------
    method : string
        A string representing the name of the method associated with these results.
    params : dict
        A dictionary of parameters used to obtain these results. Includes wall clock time of method evaluation.
    train_scores : NCScores
        An NCScores object containing train scores.
    test_scores : NCScores, optional
        An NCScores object containing test scores. Default is None.
    """

    def __init__(self, method, params, train_pred, train_labels, test_pred=None, test_labels=None):
        self.params = params
        self.method = method
        self.train_scores = None
        self.test_scores = None
        self._init_scores(train_pred, train_labels, test_pred, test_labels)

    def _init_scores(self, train_pred, train_labels, test_pred, test_labels):
        """
        Creates the train NCScores object and, if test predictions are provided, the test NCScores object.

        Parameters
        ----------
        train_pred : ndarray
            An array containing the train predictions.
        train_labels : ndarray
            An array containing the train labels.
        test_pred : ndarray, optional
            An array containing the test predictions.
        test_labels : ndarray, optional
            An array containing the test labels.
        """
        self.train_scores = NCScores(y_true=train_labels, y_pred=train_pred)
        if test_pred is not None:
            self.test_scores = NCScores(y_true=test_labels, y_pred=test_pred)

    def _select_scores(self, results):
        """
        Resolves the `results` selector shared by all public methods to a concrete NCScores object.

        Parameters
        ----------
        results : string
            One of 'train', 'test' or 'auto' ('auto' selects 'test' if test predictions are logged
            and 'train' otherwise).

        Returns
        -------
        scores : NCScores
            The selected NCScores object.
        is_test : bool
            True if the returned scores are the test scores, False if they are the train scores.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        if results == 'train':
            return self.train_scores, False
        if results == 'test':
            if self.test_scores is None:
                raise ValueError('Test scores not initialized!')
            return self.test_scores, True
        # 'auto' (or any other value): prefer test scores when available.
        if self.test_scores is not None:
            return self.test_scores, True
        return self.train_scores, False

    def save(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and all available performance metrics (for train or test
        predictions) to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where to store the output.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        # Resolve scores and metrics *before* opening the file, so that a ValueError
        # cannot leak the handle or leave a half-written record behind.
        _, is_test = self._select_scores(results)
        metric_names, metric_vals = self.get_all(results)
        with open(filename, 'a+') as f:
            f.write("Method: {}".format(self.method))
            f.write("\nParameters: ")
            for k, v in self.params.items():
                f.write(str(k) + ": " + str(v) + ", ")
            if is_test:
                f.write("\nTest scores: ")
            else:
                f.write("\nTrain scores: ")
            for name, val in zip(metric_names, metric_vals):
                f.write("\n {} = {}".format(name, val))
            f.write("\n\n")

    def pretty_print(self, results='auto'):
        """
        Prints to screen the method name, execution parameters, and all available performance metrics (for train or
        test predictions).

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.

        See Also
        --------
        get_all : Describes all the performance metrics that can be computed from train or test predictions.
        """
        print("Method: {}".format(self.method))
        print("Parameters: ")
        print(self.params.items())
        _, is_test = self._select_scores(results)
        if is_test:
            print("Test scores: ")
        else:
            print("Train scores: ")
        metric_names, metric_vals = self.get_all(results)
        for name, val in zip(metric_names, metric_vals):
            print("{} = {}".format(name, val))
        print("")

    def get_all(self, results='auto', precatk_vals=None):
        """
        Returns the names of all performance metrics that can be computed from train or test predictions and their
        associated values. These metrics are: 'f1_micro', 'f1_macro', 'f1_weighted'.

        Parameters
        ----------
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).
        precatk_vals : None, optional
            Not used. Kept for interface compatibility with the link-prediction Results class.

        Returns
        -------
        metric_names : list of string
            The names of the computed metrics.
        metric_vals : list of float
            The corresponding metric values.

        Raises
        ------
        ValueError
            If test results are requested but not initialized in constructor.
        """
        scores, _ = self._select_scores(results)
        metric_names = ['f1_micro', 'f1_macro', 'f1_weighted']
        metric_vals = [scores.f1_micro(), scores.f1_macro(), scores.f1_weighted()]
        return metric_names, metric_vals

    def save_predictions(self, filename, results='auto'):
        """
        Writes the method name, execution parameters, and the train or test predictions to a file.

        Parameters
        ----------
        filename : string or file
            A file or filename where to store the output.
        results : string, optional
            A string indicating if the 'train' or 'test' predictions should be used. Default is 'auto' (selects
            'test' if test predictions are logged and 'train' otherwise).

        Raises
        ------
        ValueError
            If test results are required but not initialized in constructor.
        """
        # Resolve scores before opening the file (see `save` for rationale).
        scores, is_test = self._select_scores(results)
        kind = 'Test' if is_test else 'Train'
        with open(filename, 'a+') as f:
            f.write("Method: {}".format(self.method))
            f.write("\nParameters: ")
            for k, v in self.params.items():
                f.write(str(k) + ": " + str(v) + ", ")
            f.write("\n{} predictions | {} labels ".format(kind, kind))
            for pred, true in zip(scores.y_pred, scores.y_true):
                f.write("\n {} {}".format(pred.item(), true.item()))
            f.write("\n")
class NCScores(object):
    """
    Class that encapsulates train or test predictions and exposes methods to compute different performance metrics.
    Supports multi-label classification.

    Parameters
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.

    Attributes
    ----------
    y_true : ndarray
        An array containing the true labels.
    y_pred : ndarray
        An array containing the predictions.
    """

    def __init__(self, y_true, y_pred):
        self.y_true = np.array(y_true)
        self.y_pred = np.array(y_pred)
        # Rank the (label, prediction) pairs from highest to lowest prediction value.
        self._sorted = sorted(zip(self.y_true, self.y_pred), key=lambda pair: pair[1], reverse=True)

    def _f1(self, averaging):
        """Computes the f1 score of the stored predictions under the given sklearn averaging scheme."""
        return f1_score(self.y_true, self.y_pred, average=averaging)

    def f1_micro(self):
        """
        Computes the f1 score globally for all labels (i.e. sums the tp for all classes and divides by the sum of all
        tp+fp).

        Returns
        -------
        f1_micro : float
            The f1 micro score.
        """
        return self._f1('micro')

    def f1_macro(self):
        """
        Computes the f1 score for each label, and finds their unweighted average. This metric does not take label
        imbalance into account.

        Returns
        -------
        f1_macro : float
            The f1 macro score.
        """
        return self._f1('macro')

    def f1_weighted(self):
        """
        Computes the f1 score for each label, and finds their average, weighted by support (the number of true
        instances for each label).

        Returns
        -------
        f1_weighted : float
            The weighted f1 score.
        """
        return self._f1('weighted')