Source code for evalne.utils.viz_utils

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 16/04/2019

# This file provides simple methods for embedding and graph visualization.
# T-SNE is applied to embeddings with more than two dimensions in order to plot them in a 2d space.

import os
import pandas as pd
import networkx as nx
import matplotlib as mpl
from sklearn.manifold import TSNE

if os.environ.get('DISPLAY', '') == '':
    mpl.use('Agg')
import matplotlib.pyplot as plt


[docs]def plot_emb2d(emb, colors=None, filename=None): """ Generates a scatter plot of the given embeddings. Optional colors for the nodes can be provided as well as a filename to store the results. If dim of embeddings > 2, uses t-SNE to reduce it to 2. Parameters ---------- emb : matrix A Numpy matrix containing the node or edge embeddings. colors : array, optional A Numpy array containing the colors of each node. Default is None. filename : string, optional A string indicating the path and name of the file where to store the scatter plot. If not provided the plot is shown on screen. Default is None. """ print('Generating embedding scatter plot...') # Get the size of the embedding n, dim = emb.shape # If needed, reduce dimensionality to 2 using t-SNE if dim > 2: print("Embedding dimension is {}, using t-SNE to reduce it to 2.".format(dim)) emb = TSNE(n_components=2).fit_transform(emb) # Plot embeddings if colors is None: plt.scatter(emb[:, 0], emb[:, 1], alpha=0.6) else: plt.scatter(emb[:, 0], emb[:, 1], alpha=0.6, c=colors) # Store or show the scatter plot if filename is None: plt.show() else: plt.savefig(filename, dpi=300, format='pdf', bbox_inches='tight')
[docs]def plot_graph2d(G, emb=None, labels=None, colors=None, filename=None): """ Plots the given graph in 2d. If the embeddings of nodes are provided, they are used to place the nodes on the 2d plane. If dim of embeddings > 2, then its reduced to 2 using t-SNE. Optional labels and colors for the nodes can be provided, as well as a filename to store the results. Parameters ---------- G : graph A NetworkX graph or digraph. emb : matrix, optional A Numpy matrix containing the node embeddings. Default is None. labels : dict, optional A dictionary containing nodeIDs as keys and node labels as values. Default is None. colors : array, optional A Numpy array containing the colors of each graph node. Default is None. filename : string, optional A string indicating the path and name of the file where to store the scatter plot. If not provided the plot is showed on screen. Default is None. """ print('Generating embedding visualization...') if emb is not None: # Get the size of the embedding n, dim = emb.shape # If needed, reduce dimensionality to 2 using t-SNE if dim > 2: print("Embedding dimension is {}, using t-SNE to reduce it to 2.".format(dim)) emb = TSNE(n_components=2).fit_transform(emb) else: # If no embeddings provided, use the spring layout to position nodes emb = nx.spring_layout(G) # Plot nodes and edges nx.draw_networkx_nodes(G, emb, node_size=100, alpha=0.6, node_color=colors) nx.draw_networkx_edges(G, emb, width=1.0, arrows=False, alpha=0.1) # Plot the labels if provided if labels is not None: nx.draw_networkx_labels(G, emb, labels=labels, font_size=6) # Store or show the scatter plot if filename is None: plt.show() else: plt.savefig(filename, dpi=300, format='pdf', bbox_inches='tight')
[docs]def plot_curve(filename, x, y, x_label, y_label, title=None): """ Plots y coordinates against x coordinates as a line. Parameters ---------- filename : string A file or filename where to store the plot. x : array_like The x coordinates of the plot. y : array_like The y coordinates of the plot. x_label : string The label of the x axis. y_label : string The label of the y axis. title : string or None, optional The title of the plot. Default is None (no title). """ plt.plot(x, y) plt.xlabel(x_label) plt.ylabel(y_label) plt.ylim([0.0, 1.0]) plt.xlim([0.0, 1.0]) if title is not None: plt.title(title) if filename is not None: plt.savefig(filename) plt.close() else: plt.show()
[docs]def parallel_coord(scoresheet, features, class_col='methods'): """ Generates a parallel coordinate plot from the given Scoresheet object and the set of features specified. Parameters ---------- scoresheet : evalne.Scoresheet A Scoresheet object containing the results of an evaluation. features : list A list of strings indicating the features to show in the plot (in addition to methods and networks). Accepted features are: 'auroc', 'average_precision', 'precision', 'recall', 'fallout', 'miss', 'accuracy', 'f_score', `eval_time` and `edge_embed_method`. class_col : string, optional Indicates the class to highlight. Options are `methods` and `networks`. Default is `methods`. """ # Get dfs per feature and stack them f_dfs = [] for f in features: f_dfs.append(scoresheet.get_pandas_df(metric=f).stack()) # Concatenate dfs and reset indexing df = pd.concat(f_dfs, axis=1, join="inner") df.reset_index(inplace=True) # Set correct column names new_names = ['methods_str', 'networks_str'] new_names.extend(features) df.set_axis(new_names, axis=1, inplace=True) # Make networks and methods numerical df['methods_str'] = pd.Categorical(df['methods_str']) df['methods'] = df['methods_str'].cat.codes df['methods'] = (df['methods'] - df['methods'].min()) / (df['methods'].max() - df['methods'].min()) df['networks_str'] = pd.Categorical(df['networks_str']) df['networks'] = df['networks_str'].cat.codes df['networks'] = (df['networks'] - df['networks'].min()) / (df['networks'].max() - df['networks'].min()) if 'edge_embed_method' in features: df['edge_embed_method'] = pd.Categorical(df['edge_embed_method']) df['edge_embed_method'] = df['edge_embed_method'].cat.codes # TODO: fix this df['edge_embed_method'] = (df['edge_embed_method'] - df['edge_embed_method'].min()) / \ (df['edge_embed_method'].max() - df['edge_embed_method'].min()) if 'eval_time' in features: df['eval_time'] = (df['eval_time'] - df['eval_time'].min()) / \ (df['eval_time'].max() - df['eval_time'].min()) # Select all numerical cols num = ['methods', 'networks'] num.extend(features) # Generate the plot pd.plotting.parallel_coordinates(df[num], class_col) ax = plt.gca() # Add labels if class_col == 'methods': for i, (label, val) in df.ix[:, ['networks_str', 'networks']].drop_duplicates().iterrows(): ax.annotate(label, xy=(0, val), ha='left', va='center') aux = df.ix[:, ['methods_str', 'methods']].drop_duplicates() plt.legend(aux['methods_str']) elif class_col == 'networks': for i, (label, val) in df.ix[:, ['methods_str', 'methods']].drop_duplicates().iterrows(): ax.annotate(label, xy=(0, val), ha='left', va='center') aux = df.ix[:, ['networks_str', 'networks']].drop_duplicates() plt.legend(aux['networks_str']) # Some changes to plot axis ax.yaxis.set_label_position("right") ax.yaxis.tick_right() plt.show()