Source code for evalne.utils.preprocess

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file provides simple methods to: (1) read and write graphs, embeddings, node similarities, ... to/from files
# and (2) preprocess graphs for the specific task of graph embedding. NetworkX graphs and digraphs are supported;
# multi-graphs are not.

import os
import logging
import collections
import numpy as np
import networkx as nx


##################################################
#                      I/O                       #
##################################################


def save_graph(G, output_path, delimiter=',', write_stats=True, write_weights=False, write_dir=True):
    """ Saves a graph to a file as an edgelist or weighted edgelist. If write_stats is True the file will include
    a header containing basic graph statistics as provided by the get_stats function.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string, optional
        The string to use for separating values in the output file. Default is ','.
    write_stats : bool, optional
        Adds basic graph statistics to the file as a header or not. Default is True.
    write_weights : bool, optional
        If True, data will be stored as a weighted edgelist, i.e. triplets (src, dst, weight); otherwise, as regular
        (src, dst) pairs. For unweighted graphs, setting this parameter to True will add weight 1 to all edges.
        Default is False.
    write_dir : bool, optional
        This parameter is only relevant for undirected graphs. If True, it forces the method to write both edge
        directions in the file, i.e. (src, dst) and (dst, src). If False, only one direction is stored.
        Default is True.

    See Also
    --------
    evalne.utils.split_train_test.get_stats : Function used to compute simple statistics to add as file header.
    """
    # Write the graph stats in the file if required
    if write_stats:
        get_stats(G, output_path)

    # Open the file where data should be stored
    f = open(output_path, 'a+b')

    # Write the graph to a file and use both edge directions if graph is undirected
    if G.is_directed():
        # Store edgelist
        if write_weights:
            J = nx.DiGraph()
            J.add_weighted_edges_from(G.edges.data('weight', 1))
            nx.write_weighted_edgelist(J, f, delimiter=delimiter)
        else:
            nx.write_edgelist(G, f, delimiter=delimiter, data=False)
    else:
        if write_dir:
            H = nx.to_directed(G)
            J = nx.DiGraph()
        else:
            H = G
            J = nx.DiGraph()
        # Store edgelist
        if write_weights:
            J.add_weighted_edges_from(H.edges.data('weight', 1))
            nx.write_weighted_edgelist(J, f, delimiter=delimiter)
        else:
            nx.write_edgelist(H, f, delimiter=delimiter, data=False)
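
# Example usage for save_graph -- a minimal sketch; the output path below is a hypothetical placeholder:
#
#   G = nx.fast_gnp_random_graph(100, 0.1, seed=42)
#   save_graph(G, './toy_net.csv', delimiter=',', write_stats=False)
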
def load_graph(input_path, delimiter=',', comments='#', directed=False, datatype=float):
    """ Reads a directed or undirected edgelist and returns a graph. If the edgelist is weighted the graph will
    maintain those weights. Nodes are read as int and weights as float.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    comments : string, optional
        The character used to indicate the start of a comment. Default is '#'.
    directed : bool
        Indicates if the method should return a graph or a digraph.
    datatype : int or float, optional
        The type of the graph weights. Default is float.

    Returns
    -------
    G : graph
        A NetworkX graph or digraph.
    """
    if directed:
        G = nx.read_edgelist(input_path, delimiter=delimiter, comments=comments,
                             create_using=nx.DiGraph, nodetype=int, data=(("weight", datatype),))
    else:
        G = nx.read_edgelist(input_path, delimiter=delimiter, comments=comments,
                             create_using=nx.Graph, nodetype=int, data=(("weight", datatype),))
    if len(G.edges) == 0:
        raise ValueError("The graph is empty; the delimiter specified might be incorrect!")
    return G
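
# Example usage for load_graph -- a minimal sketch, assuming an edgelist such as the one written by
# save_graph exists at the hypothetical path below:
#
#   G = load_graph('./toy_net.csv', delimiter=',', directed=False)
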
def read_labels(input_path, delimiter=',', comments='#', idx_mapping=None):
    """ Reads node labels from a file and returns them as a numpy ndarray where the first column contains nodeIDs
    and the second node labels. If idx_mapping is provided, the original indices are re-mapped to new indices.

    Parameters
    ----------
    input_path : file or string
        File or filename to read. The file is assumed to contain one nodeID and label pair per line.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    comments : string, optional
        The character used to indicate the start of a comment. Default is '#'.
    idx_mapping : list of tuples
        A list of (OldNodeID, NewNodeID).

    Returns
    -------
    labels : ndarray
        A two-column ndarray of (nodeID, label) pairs.
    """
    # Read the labels from a file
    labels = np.loadtxt(input_path, delimiter=delimiter, comments=comments, dtype=float)

    # Remap them if necessary
    if idx_mapping is not None:
        res = list()
        # Make labels a dict
        lab_d = dict(labels)
        for (oldnid, newnid) in idx_mapping:
            res.append((newnid, lab_d.get(oldnid)))
        return np.array(res)
    else:
        # Return the (nodeID, label) pairs as read (indexing the array with its own float IDs would fail)
        return labels
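
# Example usage for read_labels -- a minimal sketch; './labels.csv' is a hypothetical file with one
# 'nodeID,label' pair per line:
#
#   labels = read_labels('./labels.csv', delimiter=',')
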
def read_node_embeddings(input_path, nodes, embed_dim, delimiter=',', method_name=None):
    """ Reads node embeddings from a file (see Notes for the expected input file structure) and returns the results
    as a dictionary. Keys correspond to nodeIDs and values to embedding vectors. File headers are inferred based on
    the expected number of embeddings and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    nodes : array_like
        The network nodes for which embeddings are expected.
    embed_dim : int
        The expected dimensionality of the node embeddings.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided, it will be used when logging an
        error. Default is None.

    Returns
    -------
    X : dict
        A dictionary of {`nodeID`: embed_vect, `nodeID`: embed_vect, ...}. Keys are of type string and values
        array_like.

    Raises
    ------
    ValueError
        If the number of embeddings in the file is less than len(nodes), or if the input data dimensions are not
        correct.

    Notes
    -----
    The method accepts files where each line corresponds to the embedding of one node. The nodeID can be present as
    the first value in each line. If the nodeID is not present, embeddings are assumed to be written in the file in
    ascending nodeID order.
    """
    # Check how many rows are in the header
    if method_name is not None:
        emb_skiprows = infer_header(input_path, len(nodes), method_name)
    else:
        emb_skiprows = infer_header(input_path, len(nodes))

    # Read the embeddings
    X = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)

    # If the output is just a vector, raise an error
    if X.ndim == 1:
        raise ValueError('Error encountered while reading node embeddings. Check the output delimiter of the method.')

    # If the output is a matrix, check its dimensions
    if X.shape[1] == embed_dim:
        # Assume embeddings are given as a matrix [X_0, X_1, ..., X_D] where rows correspond to sorted nodeIDs
        keys = map(str, sorted(nodes))
        X = dict(zip(keys, X))
    elif X.shape[1] == embed_dim + 1:
        # Assume the first col is the nodeID and the rest are embedding features [id, X_0, X_1, ..., X_D]
        if method_name is not None:
            logging.warning('Output provided by method `{}` contains {} columns, {} expected! '
                            'Taking first column as nodeID...'.format(method_name, X.shape[1], embed_dim))
        else:
            logging.warning('Output of evaluated method contains {} columns, {} expected! '
                            'Taking first column as nodeID...'.format(X.shape[1], embed_dim))
        keys = map(str, np.array(X[:, 0], dtype=int))
        X = dict(zip(keys, X[:, 1:]))
    else:
        raise ValueError('Incorrect embedding dimension for the evaluated method! Expected: {} or {}, Received: {}'
                         .format(embed_dim, embed_dim + 1, X.shape[1]))

    # Return the embeddings
    return X
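
# Example usage for read_node_embeddings -- a minimal sketch; './node_emb.tmp' is a hypothetical file
# with one 128-dimensional embedding per line:
#
#   X = read_node_embeddings('./node_emb.tmp', nodes=range(100), embed_dim=128)
#   emb_node_0 = X['0']    # keys are strings
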
def read_edge_embeddings(input_path, ebunch_len, embed_dim, delimiter=',', method_name=None):
    """ Reads node-pair or edge embeddings from a file and returns the results as a matrix. Each line in the file is
    assumed to represent the embedding of one node-pair. File headers are inferred based on the expected number of
    embeddings and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    ebunch_len : int
        The number of embeddings expected.
    embed_dim : int
        The expected dimensionality of the embeddings.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided, it will be used when logging an
        error. Default is None.

    Returns
    -------
    Y : ndarray
        A matrix containing the node-pair embeddings as rows.

    Raises
    ------
    ValueError
        If the number of embeddings in the file is less than ebunch_len, or if the input data dimensions are not
        correct.
    """
    # Check how many rows are in the header
    if method_name is not None:
        emb_skiprows = infer_header(input_path, ebunch_len, method_name)
    else:
        emb_skiprows = infer_header(input_path, ebunch_len)

    # Read the embeddings
    Y = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)

    # Check embedding dimensions
    if (Y.ndim == 1 and len(Y) != embed_dim) or (Y.ndim == 2 and Y.shape[1] != embed_dim):
        raise ValueError('Incorrect edge embedding dimension for method {}! Expected dims: ({},{}), Received dims: {}'
                         .format(method_name, ebunch_len, embed_dim, Y.shape))

    # Return the embeddings
    return Y
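
# Example usage for read_edge_embeddings -- a minimal sketch; './edge_emb.tmp' is a hypothetical file
# with one node-pair embedding per line:
#
#   Y = read_edge_embeddings('./edge_emb.tmp', ebunch_len=500, embed_dim=128)
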
def read_predictions(input_path, ebunch_len, delimiter=',', method_name=None):
    """ Reads node similarities from a file (see Notes for the expected input file structure) and returns the
    results as a vector. File headers are inferred based on the expected number of predictions and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    ebunch_len : int
        The number of predictions expected.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided, it will be used when logging an
        error. Default is None.

    Returns
    -------
    Y : ndarray
        A column vector containing node similarities.

    Raises
    ------
    ValueError
        If the number of predictions in the file is less than ebunch_len, or if the input data dimensions are not
        correct.

    Notes
    -----
    The method accepts files where all similarities are given in a single line or one per line. In both cases, the
    expected length of these "vectors" is ebunch_len. For input files containing similarities in rows, the method
    expects the last element of each row to be the similarity. This is useful for methods that return triplets:
    src, dst, sim.
    """
    # Predictions could be returned as col or row vectors
    try:
        # Check how many rows are in the header
        if method_name is not None:
            emb_skiprows = infer_header(input_path, ebunch_len, method_name)
        else:
            emb_skiprows = infer_header(input_path, ebunch_len)
    except ValueError:
        # Predictions could have been returned as a single vector
        if method_name is not None:
            emb_skiprows = infer_header(input_path, 1, method_name)
        else:
            emb_skiprows = infer_header(input_path, 1)

    # Read the predictions
    Y = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)

    # Check prediction dimensions
    if Y.ndim == 0:
        if ebunch_len == 1:
            # Wrap the scalar so the dimension checks below still apply
            Y = np.array([Y.item()])
        else:
            if method_name is not None:
                raise ValueError('Incorrect number of predictions for method `{}`! Expected: {}, Received: {}'
                                 .format(method_name, ebunch_len, 1))
            else:
                raise ValueError('Incorrect number of predictions for evaluated method! Expected: {}, Received: {}'
                                 .format(ebunch_len, 1))
    if (Y.ndim == 1 and len(Y) != ebunch_len) or (Y.ndim == 2 and Y.shape[0] != ebunch_len):
        if method_name is not None:
            raise ValueError('Incorrect number of predictions for method `{}`! Expected: {}, Received: {}'
                             .format(method_name, ebunch_len, Y.shape))
        else:
            raise ValueError('Incorrect predictions dimension for evaluated method! Expected: {}, Received: {}'
                             .format(ebunch_len, Y.shape))
    if Y.ndim == 2:
        if method_name is not None:
            logging.warning('Output provided by method `{}` is two dimensional. Taking last column as predictions...'
                            .format(method_name))
        else:
            logging.warning('Output of evaluated method is two dimensional. Taking last column as predictions...')
        return Y[:, -1]
    else:
        return Y
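
# Example usage for read_predictions -- a minimal sketch; './preds.tmp' is a hypothetical file with one
# similarity score per evaluated node pair:
#
#   Y = read_predictions('./preds.tmp', ebunch_len=500)
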
def read_train_test(input_path, split):
    """ Reads the sets of train and test edges and non-edges from the given path that share the given split ID.
    The method assumes that these files follow the naming convention of store_train_test_splits().

    Parameters
    ----------
    input_path : string
        The path where the input data can be found.
    split : int
        The ID of the splits to be read.

    Returns
    -------
    train_E : set
        Set of train edges.
    train_E_false : set
        Set of train non-edges.
    test_E : set
        Set of test edges.
    test_E_false : set
        Set of test non-edges.

    See Also
    --------
    evalne.utils.split_train_test.store_train_test_splits : The files in the provided input path are expected to
        follow the naming convention of this function.
    """
    # Generate file names
    filenames = (os.path.join(input_path, "trE_{}.csv".format(split)),
                 os.path.join(input_path, "negTrE_{}.csv".format(split)),
                 os.path.join(input_path, "teE_{}.csv".format(split)),
                 os.path.join(input_path, "negTeE_{}.csv".format(split)))

    # Read the data
    train_E = np.loadtxt(filenames[0], delimiter=',', dtype=int)
    train_E_false = np.loadtxt(filenames[1], delimiter=',', dtype=int)
    test_E = np.loadtxt(filenames[2], delimiter=',', dtype=int)
    test_E_false = np.loadtxt(filenames[3], delimiter=',', dtype=int)

    # Return the data
    return train_E, train_E_false, test_E, test_E_false
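
# Example usage for read_train_test -- a minimal sketch, assuming split files named as in
# store_train_test_splits() exist under the hypothetical folder './splits':
#
#   train_E, train_E_false, test_E, test_E_false = read_train_test('./splits', split=0)
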

##################################################
#                   Preprocess                   #
##################################################

def prep_graph(G, relabel=True, del_self_loops=True, maincc=True):
    """ Preprocesses a graph according to the input parameters.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    relabel : bool, optional
        Determines if the nodes should be relabeled with consecutive integers 0...N. Default is True.
    del_self_loops : bool, optional
        Determines if self loops should be deleted. Default is True.
    maincc : bool, optional
        Determines if the graph (digraph) should be restricted to its main (weakly) connected component or not.
        Default is True.

    Returns
    -------
    G : graph
        A preprocessed NetworkX graph or digraph.
    Ids : list of tuples
        A list of (OldNodeID, NewNodeID). Returns None if relabel=False.

    Notes
    -----
    For some methods, trying to embed graphs with several connected components might result in inconsistent
    behaviours. This is the main reason for setting maincc=True.
    """
    # Remove self loops
    if del_self_loops:
        G.remove_edges_from(nx.selfloop_edges(G))

    # Restrict graph to its main connected component
    if maincc:
        if G.is_directed():
            Gcc = G.subgraph(max(nx.weakly_connected_components(G), key=len)).copy()
        else:
            Gcc = G.subgraph(max(nx.connected_components(G), key=len)).copy()
    else:
        Gcc = G

    # Relabel graph nodes in 0...N
    if relabel:
        Grl = nx.convert_node_labels_to_integers(Gcc, first_label=0, ordering='sorted')
        # A list of (oldNodeID, newNodeID)
        ids = list(zip(sorted(Gcc.nodes), sorted(Grl.nodes)))
        return Grl, ids
    else:
        return Gcc, None
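
# Example usage for prep_graph -- a minimal sketch: restrict a toy graph to its main connected
# component and relabel its nodes to consecutive integers:
#
#   G = nx.fast_gnp_random_graph(100, 0.05, seed=42)
#   Gp, ids = prep_graph(G, relabel=True, del_self_loops=True, maincc=True)
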
def relabel_nodes(train_E, test_E, directed):
    """ For given sets of train and test edges, the method returns relabeled sets with nodes being integers in
    0...N. Additionally, the method returns a graph containing all edges in train and test and nodes in 0...N.

    Parameters
    ----------
    train_E : set
        The set of train edges.
    test_E : set
        The set of test edges.
    directed : bool
        Indicates if the method should return a graph or a digraph.

    Returns
    -------
    train_E : set
        The relabeled set of train edges.
    test_E : set
        The relabeled set of test edges.
    G : graph
        A NetworkX graph or digraph with relabeled nodes containing the edges in train and test.
    mapping : dict
        A dictionary containing old node IDs as keys and new IDs as values.
    """
    # Create the set of all edges
    E = train_E | test_E

    # Create a graph
    if directed:
        H = nx.DiGraph()
        H.add_edges_from(E)
    else:
        H = nx.Graph()
        H.add_edges_from(E)

    # Save the mapping and relabel the graph
    mapping = dict(zip(H.nodes(), range(len(H.nodes()))))
    nx.relabel_nodes(H, mapping, copy=False)

    # Generate the new train set
    tr_E = set()
    for (src, dst) in train_E:
        tr_E.add((mapping[src], mapping[dst]))

    # Generate the new test set
    te_E = set()
    for (src, dst) in test_E:
        te_E.add((mapping[src], mapping[dst]))

    # Return the results
    return tr_E, te_E, H, mapping
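
# Example usage for relabel_nodes -- a minimal sketch on toy edge sets:
#
#   tr_E, te_E, H, mapping = relabel_nodes({(10, 20), (20, 30)}, {(10, 30)}, directed=False)
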
def prune_nodes(G, threshold):
    """ Removes from the graph all nodes whose degree is strictly smaller than the threshold. This can result in a
    disconnected graph.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    threshold : int
        All nodes with degree lower than this value will be removed from the graph.

    Returns
    -------
    H : graph
        A copy of the original graph or digraph with the corresponding nodes removed.
    """
    # Create a copy of the original graph
    H = G.copy()

    # Loop over the (node, degree) pairs and remove the corresponding nodes
    for node, deg in G.degree:
        if deg < threshold:
            H.remove_node(node)

    # Return the resulting graph
    return H
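
# Example usage for prune_nodes -- a minimal sketch: on a 5-node path graph a threshold of 2 removes
# the two degree-1 endpoints:
#
#   H = prune_nodes(nx.path_graph(5), threshold=2)
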
def get_redges_false(G, output_path=None):
    """ For directed graphs, returns a set of all non-edges for which the opposite edge exists in the graph, i.e.
    all pairs of non-edges (a -> b) such that (b -> a) exists in the graph.

    Parameters
    ----------
    G : graph
        A NetworkX digraph.
    output_path : string, optional
        A path or file where to store the results. Default is None.

    Returns
    -------
    redges_false : set
        A set of node pairs respecting the mentioned property.
    """
    if not G.is_directed():
        raise ValueError("Function only defined for directed graphs.")

    # Get the graph edges as a set E
    E = set(G.edges)

    # Reverse all edges
    redges_false = set(tuple(reversed(edge_tuple)) for edge_tuple in E)

    # Keep only the reversed edges which are not real edges
    redges_false = redges_false - E

    # Save the data if a path is provided
    if output_path is not None:
        redges_false_np = np.array([list(edge_tuple) for edge_tuple in redges_false])
        np.savetxt(output_path, redges_false_np, delimiter=',', fmt='%d')

    # Return the set of edges
    return redges_false
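
# Example usage for get_redges_false -- a minimal sketch: in the toy digraph below, only (2, 1) is a
# non-edge whose reverse (1, 2) is an edge:
#
#   G = nx.DiGraph([(0, 1), (1, 0), (1, 2)])
#   redges = get_redges_false(G)    # {(2, 1)}
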

##################################################
#                     Utils                      #
##################################################

def infer_header(input_path, expected_lines, method_name=None):
    """ Infers the length of a file header from the total number of lines in the file and the number of expected
    lines.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    expected_lines : int
        The number of expected lines in the input file.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided, it will be used when logging an
        error. Default is None.

    Returns
    -------
    header_len : int
        The length of the header.

    Raises
    ------
    ValueError
        If more lines are expected than those in the file.
    """
    # Autodetect the header of the input as (num_lines_in_input - expected_lines)
    num_lines = sum(1 for _ in open(input_path))
    header_len = num_lines - expected_lines
    if header_len < 0:
        raise ValueError('Exception, not enough lines in input file! Expected {} lines, obtained {}.'
                         .format(expected_lines, num_lines))
    elif header_len > 0:
        if method_name is not None:
            logging.warning('Output of method `{}` contains {} more lines than expected. Will consider them part '
                            'of the header and ignore them... Expected num_lines {}, obtained lines {}.'
                            .format(method_name, header_len, expected_lines, num_lines))
        else:
            logging.warning('Output contains {} more lines than expected. Will consider them part '
                            'of the header and ignore them... Expected num_lines {}, obtained lines {}.'
                            .format(header_len, expected_lines, num_lines))

    # Return the header length
    return header_len
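
# Example usage for infer_header -- a minimal sketch; './node_emb.tmp' is a hypothetical file expected
# to contain 100 data lines plus a possible header:
#
#   header_len = infer_header('./node_emb.tmp', expected_lines=100)
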
def get_stats(G, output_path=None, all_stats=False):
    """ Prints or stores some basic statistics about the graph. If an output path is provided the results are
    written to said file.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    output_path : file or string, optional
        File or filename to write. Default is None.
    all_stats : bool, optional
        Sets if all stats or a small subset of them should be shown. Computing all stats can be very slow.
        Default is False.
    """
    # Compute the number of nodes and edges of the graph
    N = len(G.nodes)
    M = len(G.edges)

    # Compute the average degree and the number of degree 1 and degree 2 nodes
    degs = np.array(G.degree)[:, 1]
    avgdeg = sum(degs) / N
    counts = collections.Counter(degs)
    degdict = collections.OrderedDict(sorted(counts.items()))
    deg1 = degdict.get(1, 0)
    deg2 = degdict.get(2, 0)

    if all_stats:
        x = np.log(np.array(list(degdict.keys())))      # degrees (list() is required in Python 3)
        y = np.log(np.array(list(degdict.values())))    # frequencies
        # The power-law coef. is the slope of a linear model fitted to the loglog data, which has a
        # closed-form solution
        plawcoef = np.abs(np.cov(x, y) / np.var(x))[0, 1]
        cc = nx.average_clustering(G)
        dens = nx.density(G)
        if G.is_directed():
            diam = nx.diameter(G) if nx.is_strongly_connected(G) else float('inf')
        else:
            diam = nx.diameter(G)

    # Print or write to file the graph info
    if output_path is None:
        # Print some basic info about the graph
        if G.is_directed():
            num_ccs = nx.number_weakly_connected_components(G)
            Gcc = G.subgraph(max(nx.weakly_connected_components(G), key=len)).copy()
            Ncc = len(Gcc.nodes)
            Mcc = len(Gcc.edges)
            print("Directed graph")
            print("Num. nodes: {}".format(N))
            print("Num. edges: {}".format(M))
            print("Num. weakly connected components: {}".format(num_ccs))
            print("Num. nodes in largest weakly CC: {} ({} % of total)".format(Ncc, Ncc * 100.0 / N))
            print("Num. edges in largest weakly CC: {} ({} % of total)".format(Mcc, Mcc * 100.0 / M))
        else:
            num_ccs = nx.number_connected_components(G)
            Gcc = G.subgraph(max(nx.connected_components(G), key=len)).copy()
            Ncc = len(Gcc.nodes)
            Mcc = len(Gcc.edges)
            print("Undirected graph")
            print("Num. nodes: {}".format(N))
            print("Num. edges: {}".format(M))
            print("Num. connected components: {}".format(num_ccs))
            print("Num. nodes in largest CC: {} ({} % of total)".format(Ncc, Ncc * 100.0 / N))
            print("Num. edges in largest CC: {} ({} % of total)".format(Mcc, Mcc * 100.0 / M))
        if all_stats:
            print("Clustering coefficient: {}".format(cc))
            print("Diameter: {}".format(diam))
            print("Density: {}".format(dens))
            print("Power-law coefficient: {}".format(plawcoef))
        print("Avg. node degree: {}".format(avgdeg))
        print("Num. degree 1 nodes: {}".format(deg1))
        print("Num. degree 2 nodes: {}".format(deg2))
        print("Num. self loops: {}".format(nx.number_of_selfloops(G)))
        print("")
    else:
        # Write the info to the provided file
        f = open(output_path, 'w+b')
        if G.is_directed():
            num_ccs = nx.number_weakly_connected_components(G)
            Gcc = G.subgraph(max(nx.weakly_connected_components(G), key=len)).copy()
            Ncc = len(Gcc.nodes)
            Mcc = len(Gcc.edges)
            f.write("# Directed graph".encode())
            f.write("\n# Num. nodes: {}".format(N).encode())
            f.write("\n# Num. edges: {}".format(M).encode())
            f.write("\n# Num. weakly connected components: {}".format(num_ccs).encode())
            f.write("\n# Num. nodes in largest weakly CC: {} ({} % of total)".format(Ncc, Ncc * 100.0 / N).encode())
            f.write("\n# Num. edges in largest weakly CC: {} ({} % of total)".format(Mcc, Mcc * 100.0 / M).encode())
        else:
            num_ccs = nx.number_connected_components(G)
            Gcc = G.subgraph(max(nx.connected_components(G), key=len)).copy()
            Ncc = len(Gcc.nodes)
            Mcc = len(Gcc.edges)
            f.write("# Undirected graph".encode())
            f.write("\n# Num. nodes: {}".format(N).encode())
            f.write("\n# Num. edges: {}".format(M).encode())
            f.write("\n# Num. connected components: {}".format(num_ccs).encode())
            f.write("\n# Num. nodes in largest CC: {} ({} % of total)".format(Ncc, Ncc * 100.0 / N).encode())
            f.write("\n# Num. edges in largest CC: {} ({} % of total)".format(Mcc, Mcc * 100.0 / M).encode())
        if all_stats:
            f.write("\n# Clustering coefficient: {}".format(cc).encode())
            f.write("\n# Diameter: {}".format(diam).encode())
            f.write("\n# Density: {}".format(dens).encode())
            f.write("\n# Power-law coefficient: {}".format(plawcoef).encode())
        f.write("\n# Avg. node degree: {}".format(avgdeg).encode())
        f.write("\n# Num. degree 1 nodes: {}".format(deg1).encode())
        f.write("\n# Num. degree 2 nodes: {}".format(deg2).encode())
        f.write("\n# Num. self loops: {}".format(nx.number_of_selfloops(G)).encode())
        f.write("\n".encode())
        f.close()
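
# Example usage for get_stats -- a minimal sketch: print basic statistics for a toy graph; pass an
# output path instead to write them as a '#'-commented header:
#
#   get_stats(nx.fast_gnp_random_graph(100, 0.1, seed=42))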