#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018
# This file provides simple methods to: (1) read and write graphs, embeddings, node similarities, ... to/from files and
# (2) preprocess graphs for the specific task of graph embedding. NetworkX graphs and digraphs are supported,
# multi-graphs are not.
import os
import logging
import collections
import numpy as np
import networkx as nx
##################################################
# I/O #
##################################################
def save_graph(G, output_path, delimiter=',', write_stats=True, write_weights=False, write_dir=True):
    """
    Saves a graph to a file as an edgelist or weighted edgelist. If write_stats is True the file will include a header
    containing basic graph statistics as provided by the get_stats function.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    output_path : file or string
        File or filename to write. If a file is provided, it must be opened in 'wb' mode.
    delimiter : string, optional
        The string to use for separating values in the output file. Default is ','.
    write_stats : bool, optional
        Adds basic graph statistics to the file as a header or not. Default is True.
    write_weights : bool, optional
        If True data will be stored as weighted edgelist i.e. triplets (src, dst, weight), otherwise, as regular
        (src, dst) pairs. For unweighted graphs, setting this parameter to True will add weight 1 to all edges.
        Default is False.
    write_dir : bool, optional
        This parameter is only relevant for undirected graphs. If True, it forces the method to write both edge
        directions in the file i.e. (src, dst) and (dst, src). If False, only one direction is stored. Default is True.

    See Also
    --------
    evalne.utils.split_train_test.get_stats : Function used to compute simple statistics to add as file header.
    """
    # Write the graph stats in the file if required
    if write_stats:
        get_stats(G, output_path)
    # Open in append-binary mode so the stats header (if any) is preserved.
    # BUG FIX: the original opened the file without ever closing it; the context
    # manager guarantees the handle is released even if writing fails.
    with open(output_path, 'a+b') as f:
        if G.is_directed():
            # Store edgelist
            if write_weights:
                # Copy into a fresh DiGraph with explicit weights (default 1)
                J = nx.DiGraph()
                J.add_weighted_edges_from(G.edges.data('weight', 1))
                nx.write_weighted_edgelist(J, f, delimiter=delimiter)
            else:
                nx.write_edgelist(G, f, delimiter=delimiter, data=False)
        else:
            # For undirected graphs, optionally materialize both edge directions
            H = nx.to_directed(G) if write_dir else G
            # Store edgelist
            if write_weights:
                J = nx.DiGraph()
                J.add_weighted_edges_from(H.edges.data('weight', 1))
                nx.write_weighted_edgelist(J, f, delimiter=delimiter)
            else:
                nx.write_edgelist(H, f, delimiter=delimiter, data=False)
def load_graph(input_path, delimiter=',', comments='#', directed=False, datatype=float):
    """
    Reads a directed or undirected edgelist and returns a graph. If the edgelist is weighted the graph will maintain
    those weights. Nodes are read as int and weights as float.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    comments : string, optional
        The character used to indicate the start of a comment. Default is '#'.
    directed : bool
        Indicates if the method should return a graph or a digraph.
    datatype : int or float, optional
        The type of the graph weights. Default is float.

    Returns
    -------
    G : graph
        A NetworkX graph or digraph.
    """
    # Pick the appropriate container class up front and read the edgelist once
    graph_cls = nx.DiGraph if directed else nx.Graph
    G = nx.read_edgelist(input_path, delimiter=delimiter, comments=comments, create_using=graph_cls,
                         nodetype=int, data=(("weight", datatype),))
    # An empty edge set almost always means the wrong delimiter was used
    if len(G.edges) == 0:
        raise ValueError("The graph is empty, the delimiter specified might be incorrect!")
    return G
def read_labels(input_path, delimiter=',', comments='#', idx_mapping=None):
    """
    Reads node labels from a file and returns them as a numpy ndarray where the first column contains nodeIDs and the
    second, node attributes. If idx_mapping is provided, the original indices are re-mapped to new indices.

    Parameters
    ----------
    input_path : file or string
        File or filename to read. File is assumed to contain in each line a nodeID and label pair.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    comments : string, optional
        The character used to indicate the start of a comment. Default is '#'.
    idx_mapping : list of tuples
        A list of (OldNodeID, NewNodeID).

    Returns
    -------
    labels : ndarray
        A column vector of nodeID and label pairs.
    """
    # Read the labels from a file as (nodeID, label) rows
    labels = np.loadtxt(input_path, delimiter=delimiter, comments=comments, dtype=float)
    # Remap them if necessary
    if idx_mapping is not None:
        # Make labels a dict {oldNodeID: label}; labels of unmapped IDs become None
        lab_d = dict(labels)
        res = [(newnid, lab_d.get(oldnid)) for (oldnid, newnid) in idx_mapping]
        return np.array(res)
    else:
        # BUG FIX: the original `labels[labels[:, 0]]` indexed rows with a float
        # array, which raises IndexError in numpy. Return the rows ordered by
        # ascending nodeID instead (row i corresponds to node i for 0..N-1 IDs).
        return labels[labels[:, 0].argsort()]
def read_node_embeddings(input_path, nodes, embed_dim, delimiter=',', method_name=None):
    """
    Reads node embeddings from a file (see Notes for expected input file structure) and returns the results as a
    dictionary. Keys correspond to nodeIDs and values to vectors. File headers are inferred base on the expected
    number of embeddings and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    nodes : array_like
        The network nodes for which embeddings are expected.
    embed_dim : int
        The expected dimensionality of the node embeddings.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided will be used when logging an error.
        Default is None.

    Returns
    -------
    X : dict
        A dictionary of {`nodeID`: embed_vect, `nodeID`: embed_vect, ...}. Keys are type string and values array_like.

    Raises
    ------
    ValueError
        If the number of embeddings in the file is less than len(nodes).
        If the input data dimensions are not correct.

    Notes
    -----
    The method accepts files where each line corresponds to the embedding of one node. The nodeID can be present as
    the first value in each line. If the nodeID is not present, embeddings are assumed to be written in the file in
    ascending nodeID order.
    """
    # Determine how many header rows to skip before the data starts
    if method_name is None:
        emb_skiprows = infer_header(input_path, len(nodes))
    else:
        emb_skiprows = infer_header(input_path, len(nodes), method_name)
    # Read the embeddings as a 2D float matrix
    X = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)
    # A 1D result means the delimiter did not split any columns
    if X.ndim == 1:
        raise ValueError('Error encountered while reading node embeddings. Check output delimiter of the method.')
    num_cols = X.shape[1]
    if num_cols == embed_dim:
        # Rows are assumed to be embeddings written in ascending nodeID order
        return dict(zip(map(str, sorted(nodes)), X))
    if num_cols == embed_dim + 1:
        # One extra column: assume [id, X_0, X_1, ..., X_D] and warn the user
        if method_name is not None:
            logging.warning('Output provided by method `{}` contains {} columns, {} expected! '
                            'Taking first column as nodeID...'.format(method_name, num_cols, embed_dim))
        else:
            logging.warning('Output of evaluated method contains {} columns, {} expected! '
                            'Taking first column as nodeID...'.format(num_cols, embed_dim))
        keys = map(str, np.array(X[:, 0], dtype=int))
        return dict(zip(keys, X[:, 1:]))
    # Any other width is an error
    raise ValueError('Incorrect embedding dimension for the evaluated method! Expected: {} or {} Received: {}'
                     .format(embed_dim, embed_dim + 1, num_cols))
def read_edge_embeddings(input_path, ebunch_len, embed_dim, delimiter=',', method_name=None):
    """
    Reads node-pair or edge embeddings from a file and returns the results as a matrix. Each line in the file is
    assumed to represent the embedding of one node-pair. File headers are inferred base on the expected number of
    embeddings and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    ebunch_len : int
        The number of embeddings expected.
    embed_dim : int
        The expected dimensionality of the embeddings.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided will be used when logging an error.
        Default is None.

    Returns
    -------
    Y : ndarray
        A column vector containing embeddings as rows.

    Raises
    ------
    ValueError
        If the number of embeddings in the file is less than ebunch_len.
        If the input data dimensions are not correct.
    """
    # Determine how many header rows to skip before the data starts
    if method_name is None:
        emb_skiprows = infer_header(input_path, ebunch_len)
    else:
        emb_skiprows = infer_header(input_path, ebunch_len, method_name)
    # Read the embeddings as a float matrix
    Y = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)
    # Validate the embedding dimension (a 1D result means a single embedding row)
    wrong_1d = Y.ndim == 1 and len(Y) != embed_dim
    wrong_2d = Y.ndim == 2 and Y.shape[1] != embed_dim
    if wrong_1d or wrong_2d:
        raise ValueError('Incorrect edge embedding dimension for method {}! Expected dims: ({},{}), Received dims: {}'
                         .format(method_name, ebunch_len, embed_dim, Y.shape))
    return Y
def read_predictions(input_path, ebunch_len, delimiter=',', method_name=None):
    """
    Reads node similarities from a file (see Notes for expected input file structure) and returns the results as a
    vector. File headers are inferred base on the expected number of predictions and ignored.

    Parameters
    ----------
    input_path : file or string
        File or filename to read.
    ebunch_len : int
        The number of predictions expected.
    delimiter : string, optional
        The string used to separate values in the input file. Default is ','.
    method_name : string, optional
        A string indicating the name of the method being evaluated. If provided will be used when logging an error.
        Default is None.

    Returns
    -------
    Y : ndarray
        A column vector containing node similarities.

    Raises
    ------
    ValueError
        If the number of predictions in the file is less than ebunch_len.
        If the input data dimensions are not correct.

    Notes
    -----
    The method accepts files where all similarities are given in a single line in the file or one per line. In both
    case, the expected length of these "vectors" is ebunch_len. For input files containing similarities in rows, the
    method expects the last element of each row to be the similarity. This is useful for methods that return triplets:
    src, dst, sim.
    """
    # Predictions could be returned as col or row vectors
    try:
        # Check how many rows are in the header
        if method_name is not None:
            emb_skiprows = infer_header(input_path, ebunch_len, method_name)
        else:
            emb_skiprows = infer_header(input_path, ebunch_len)
    except ValueError:
        # Predictions could have been returned as a single row vector
        if method_name is not None:
            emb_skiprows = infer_header(input_path, 1, method_name)
        else:
            emb_skiprows = infer_header(input_path, 1)
    # Read the predictions
    Y = np.genfromtxt(input_path, delimiter=delimiter, dtype=float, skip_header=emb_skiprows, autostrip=True)
    # A 0-d result means the file contained a single scalar value
    if Y.ndim == 0:
        if ebunch_len == 1:
            # BUG FIX: return immediately. The original assigned `Y = [Y.item()]`
            # and fell through to the `Y.ndim` checks below, which raise
            # AttributeError on a plain list.
            return [Y.item()]
        else:
            if method_name is not None:
                raise ValueError('Incorrect number of predictions for method `{}`! Expected: {}, Received: {}'
                                 .format(method_name, ebunch_len, 1))
            else:
                # BUG FIX: the original formatted this 2-placeholder message with
                # (method_name, ebunch_len, 1), printing "Expected: None".
                raise ValueError('Incorrect number of predictions for evaluated method! Expected: {}, Received: {}'
                                 .format(ebunch_len, 1))
    # Validate that the number of predictions matches what was requested
    if Y.ndim == 1 and len(Y) != ebunch_len or Y.ndim == 2 and Y.shape[0] != ebunch_len:
        if method_name is not None:
            raise ValueError('Incorrect number of predictions for method `{}`! Expected: {}, Received: {}'
                             .format(method_name, ebunch_len, Y.shape))
        else:
            raise ValueError('Incorrect predictions dimension for evaluated method! Expected: {}, Received: {}'
                             .format(ebunch_len, Y.shape))
    if Y.ndim == 2:
        # Rows are assumed to be (src, dst, ..., sim) triplets; keep the last column
        if method_name is not None:
            logging.warning('Output provided by method `{}` is two dimensional. Taking last column as predictions... '
                            .format(method_name))
        else:
            logging.warning('Output of evaluated method is two dimensional. Taking last column as predictions... ')
        return Y[:, -1]
    else:
        return Y
def read_train_test(input_path, split):
    """
    Reads the sets of train and test edges and non-edges from the given path that share the given split ID.
    The method assumes that these files follow the naming convention of store_train_test_splits().

    Parameters
    ----------
    input_path : string
        The path where the input data can be found.
    split : int
        The ID of the splits to be read.

    Returns
    -------
    train_E : set
        Set of train edges
    train_E_false : set
        Set of train non-edges
    test_E : set
        Set of test edges
    test_E_false : set
        Set of test non-edges

    See Also
    --------
    evalne.utils.split_train_test.store_train_test_splits :
        The files in the provided input path are expected to follow the naming convention of this function.
    """
    # File name templates following the store_train_test_splits() convention
    templates = ("trE_{}.csv", "negTrE_{}.csv", "teE_{}.csv", "negTeE_{}.csv")
    # Load each file in turn as an integer edge array
    train_E, train_E_false, test_E, test_E_false = (
        np.loadtxt(os.path.join(input_path, tmpl.format(split)), delimiter=',', dtype=int)
        for tmpl in templates)
    return train_E, train_E_false, test_E, test_E_false
##################################################
# Preprocess #
##################################################
def prep_graph(G, relabel=True, del_self_loops=True, maincc=True):
    """
    Preprocess a graph according to the input parameters.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph. Note: self loops are removed from this graph in place.
    relabel : bool, optional
        Determines if the nodes should be relabeled with consecutive integers 0...N. Default is True.
    del_self_loops : bool, optional
        Determines if self loops should be deleted. Default is True.
    maincc : bool, optional
        Determines if the graphs (digraphs) should be restricted to the main (weakly) connected component or not.
        Default is True.

    Returns
    -------
    G : graph
        A preprocessed NetworkX graph or digraph.
    Ids : list of tuples
        A list of (OldNodeID, NewNodeID). Returns None if relabel=False.

    Notes
    -----
    For some methods trying to embed graphs with several connected components might result in inconsistent behaviours.
    This is the main reason for setting maincc=True.
    """
    # Drop self loops (note: modifies the input graph)
    if del_self_loops:
        G.remove_edges_from(nx.selfloop_edges(G))
    # Restrict to the largest (weakly) connected component if requested
    Gcc = G
    if maincc:
        components = nx.weakly_connected_components(G) if G.is_directed() else nx.connected_components(G)
        Gcc = G.subgraph(max(components, key=len)).copy()
    # Without relabeling there is no node-ID mapping to return
    if not relabel:
        return Gcc, None
    # Relabel nodes as consecutive integers 0...N, preserving sorted order
    Grl = nx.convert_node_labels_to_integers(Gcc, first_label=0, ordering='sorted')
    # Pair old and new node IDs positionally (both sorted ascending)
    id_pairs = list(zip(sorted(Gcc.nodes), sorted(Grl.nodes)))
    return Grl, id_pairs
def relabel_nodes(train_E, test_E, directed):
    """
    For given sets of train and test edges, the method returns relabeled sets with nodes being integers in 0...N.
    Additionally, the method returns a graph containing all edges in the train and test and nodes in 0...N.

    Parameters
    ----------
    train_E : set
        The set of train edges.
    test_E : set
        The set of test edges.
    directed: bool
        Indicates if the method should return a graph or a digraph.

    Returns
    -------
    train_E : set
        The set of train edges
    test_E : set
        The set of test edges
    G : graph
        A NetworkX graph or digraph with relabeled nodes containing the edges in train and test.
    mapping : dict
        A dictionary containing old node id's as key and new id's as values.
    """
    # Build one graph holding the union of train and test edges
    H = nx.DiGraph() if directed else nx.Graph()
    H.add_edges_from(train_E | test_E)
    # Map each old node ID to a consecutive integer and relabel in place
    mapping = {old_id: new_id for new_id, old_id in enumerate(H.nodes())}
    nx.relabel_nodes(H, mapping, copy=False)
    # Translate both edge sets through the mapping
    tr_E = {(mapping[src], mapping[dst]) for (src, dst) in train_E}
    te_E = {(mapping[src], mapping[dst]) for (src, dst) in test_E}
    return tr_E, te_E, H, mapping
def prune_nodes(G, threshold):
    """
    Removes all nodes from the graph whose degree is strictly smaller that the threshold. This can result in a
    disconnected graph.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    threshold : int
        All nodes with degree lower than this value will be removed from the graph.

    Returns
    -------
    H : graph
        A copy of the original graph or digraph with removed nodes.
    """
    # Work on a copy so the input graph is left untouched
    pruned = G.copy()
    # Collect low-degree nodes first, then drop them from the copy
    to_remove = [node for node, deg in G.degree if deg < threshold]
    for node in to_remove:
        pruned.remove_node(node)
    return pruned
def get_redges_false(G, output_path=None):
    """
    For directed graphs returns a list of all non-edges for which the opposite edge exists in the graph.
    E.g. returns all pairs of non-edges (a -> b) such that (b -> a) exists in the graph.

    Parameters
    ----------
    G : graph
        A NetworkX digraph.
    output_path : string
        A path or file where to store the results.

    Returns
    -------
    redges_false : set
        A set of node pairs respecting the mentioned property.
    """
    # Edge reversal is only meaningful for directed graphs
    if not G.is_directed():
        raise ValueError("Function only defined for directed graphs.")
    # Reverse every edge, then discard those reversals that are real edges
    edges = set(G.edges)
    redges_false = {(dst, src) for (src, dst) in edges} - edges
    # Optionally persist the result as a CSV edgelist
    if output_path is not None:
        redges_arr = np.array([list(pair) for pair in redges_false])
        np.savetxt(output_path, redges_arr, delimiter=',', fmt='%d')
    return redges_false
##################################################
# Utils #
##################################################
def get_stats(G, output_path=None, all_stats=False):
    """
    Prints or stores some basic statistics about the graph. If an output path is provided the results are written in
    said file.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    output_path : file or string, optional
        File or filename to write. Default is None
    all_stats : bool, optional
        Sets if all stats or a small subset of them should be shown. Computing all stats can be very slow.
        Default is False.
    """
    # Compute the number of nodes and edges of the graph
    N = len(G.nodes)
    M = len(G.edges)
    # Compute average degree and deg1 and deg2 num nodes
    degs = np.array(G.degree)[:, 1]
    avgdeg = sum(degs) / N
    counts = collections.Counter(degs)
    degdict = collections.OrderedDict(sorted(counts.items()))
    deg1 = degdict.get(1, 0)
    deg2 = degdict.get(2, 0)
    if all_stats:
        # BUG FIX: dict views must be materialized before np.array() in Python 3,
        # otherwise np.array(degdict.keys()) yields a 0-d object array and np.log fails.
        x = np.log(np.array(list(degdict.keys())))       # degrees
        y = np.log(np.array(list(degdict.values())))     # frequencies
        # the power-law coef. is the slope of a linear model fitted to the loglog data which has closed-form solution
        plawcoef = np.abs(np.cov(x, y) / np.var(x))[0, 1]
        cc = nx.average_clustering(G)
        dens = nx.density(G)
        if G.is_directed():
            diam = nx.diameter(G) if nx.is_strongly_connected(G) else float('inf')
        else:
            # Guard against disconnected graphs, consistent with the directed branch
            # (nx.diameter raises on disconnected input)
            diam = nx.diameter(G) if nx.is_connected(G) else float('inf')
    # Compute largest connected component stats and direction-dependent wording
    if G.is_directed():
        num_ccs = nx.number_weakly_connected_components(G)
        Gcc = G.subgraph(max(nx.weakly_connected_components(G), key=len))
        graph_type = "Directed"
        comp_name = "weakly connected components"
        cc_name = "weakly CC"
    else:
        num_ccs = nx.number_connected_components(G)
        Gcc = G.subgraph(max(nx.connected_components(G), key=len))
        graph_type = "Undirected"
        comp_name = "connected components"
        # BUG FIX: the original print branch said "largest weakly CC" for undirected
        # graphs; use "largest CC" as the file-writing branch already did.
        cc_name = "CC"
    Ncc = len(Gcc.nodes)
    Mcc = len(Gcc.edges)
    # Assemble all report lines once; print and file output share the same content
    lines = ["{} graph".format(graph_type),
             "Num. nodes: {}".format(N),
             "Num. edges: {}".format(M),
             "Num. {}: {}".format(comp_name, num_ccs),
             "Num. nodes in largest {}: {} ({} % of total)".format(cc_name, Ncc, Ncc * 100.0 / N),
             "Num. edges in largest {}: {} ({} % of total)".format(cc_name, Mcc, Mcc * 100.0 / M)]
    if all_stats:
        lines += ["Clustering coefficient: {}".format(cc),
                  "Diameter: {}".format(diam),
                  "Density: {}".format(dens),
                  "Power-law coefficient: {}".format(plawcoef)]
    lines += ["Avg. node degree: {}".format(avgdeg),
              "Num. degree 1 nodes: {}".format(deg1),
              "Num. degree 2 nodes: {}".format(deg2),
              "Num. self loops: {}".format(nx.number_of_selfloops(G))]
    if output_path is None:
        # Print the graph info to stdout
        for line in lines:
            print(line)
        print("")
    else:
        # Write the info to the provided file as '# '-prefixed header lines.
        # BUG FIX: use a context manager so the handle is closed even on error.
        with open(output_path, 'w+b') as f:
            f.write("# {}".format(lines[0]).encode())
            for line in lines[1:]:
                f.write("\n# {}".format(line).encode())
            f.write("\n".encode())