#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018
# This file provides a set of functions for (1) edge sampling: sampling sets of train and test edges from given input
# graphs, (2) non-edge sampling: sampling sets of train and test non-edges i.e. pairs of unconnected nodes,
# (3) node-pair sampling: randomly sampling a subset of all possible node pairs including edges and non-edges.
# Additional functions for computing a spanning tree of a given graph are provided.
import os
import random
import warnings
import numpy as np
import networkx as nx
from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from scipy.sparse import tril
from scipy.sparse import triu
from scipy.sparse.csgraph import depth_first_tree
from evalne.utils import preprocess as pp
##################################################
# Edge sampling #
##################################################
[docs]def split_train_test(G, train_frac=0.51, st_alg='wilson'):
"""
Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
spanning tree approach (see Notes). The resulting train edge set has the following properties: spans a graph
(digraph) with a single connected (weakly connected) component and the same nodes as G.
Parameters
----------
G : graph
A NetworkX graph or digraph with a single connected (weakly connected) component.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
st_alg : string, optional
The algorithm to use for generating the spanning tree constituting the backbone of the train set.
Options are: 'wilson' and 'broder'. The first option, 'wilson' is faster in most cases. Default is 'wilson'.
Returns
-------
train_E : set
The set of train edges.
test_E : set
The set of test edges.
Raises
------
ValueError
If the train_frac parameter is not in range (0, 1].
If the input graph G has more than one (weakly) connected component.
See Also
--------
evalne.utils.split_train_test.broder_alg, evalne.utils.split_train_test.wilson_alg :
The two options for the st_alg parameter.
Notes
-----
The method proceeds as follows: (1) a spanning tree of the input graph is selected uniformly at random using broder
or wilson's algorithm, (2) randomly selected edges are added to those of the spanning tree until train_frac is
reached, (3) the remaining edges, not used in previous steps, form the test set.
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
if train_frac <= 0.0 or train_frac > 1.0:
raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
if train_frac == 1.0:
return set(G.edges), set()
# Create a set of all edges in G
E = set(G.edges)
if st_alg == 'broder':
# Compute a random spanning tree using broder's algorithm
train_E = broder_alg(G, E)
else:
# Compute a random spanning tree using wilson's algorithm
train_E = wilson_alg(G, E)
# Fill test edge set as all edges not in the spanning tree
test_E = E - train_E
# Compute num train edges
num_E = len(E)
num_train_E = np.ceil(train_frac * num_E)
# Check if the num edges in the spanning tree is already greater than the num train edges
num_toadd = int(num_train_E - len(train_E))
if num_toadd <= 0:
print("WARNING: In order to return a connected train set the train_frac parameter needs to be higher!")
print("In this case, the provided train set constitutes a random spanning tree of the input graph.")
print("The train_frac value used is: {}".format(len(train_E) / num_E))
print("Edges requested: train = {}, test = {}".format(num_train_E, num_E - num_train_E))
print("Edges returned: train = {}, test = {}".format(len(train_E), num_E - len(train_E)))
else:
# Add more edges to train set from test set until it has desired size
edges = set(random.sample(test_E, num_toadd))
test_E = test_E - edges
train_E = train_E | edges
# Perform some simple checks
assert E == (test_E | train_E)
assert len(E) == len(test_E) + len(train_E)
if num_toadd > 0:
assert num_train_E == len(train_E)
# Return the sets of edges
return train_E, test_E
[docs]def quick_split(G, train_frac=0.51):
"""
Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
quick split approach (see Notes). The resulting train edge set has the following properties: spans a graph
(digraph) with a single connected (weakly connected) component and the same nodes as G.
Parameters
----------
G : graph
A NetworkX graph or digraph with a single connected (weakly connected) component.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
Returns
-------
train_E : ndarray
Column vector of train edges as pairs src, dst.
test_E : ndarray
Column vector of test edges as pairs src, dst.
Raises
------
ValueError
If the train_frac parameter is not in range (0, 1].
If the input graph G has more than one (weakly) connected component.
Notes
-----
The method proceeds as follows: (1) a spanning tree of the input graph is generated using a depth first tree
approach starting at a random node, (2) randomly selected edges are added to those of the spanning tree until
train_frac is reached, (3) the remaining edges, not used in previous steps, form the test set.
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
if train_frac <= 0.0 or train_frac > 1.0:
raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
if train_frac == 1.0:
return set(G.edges), set()
# Get Adj matrix
if nx.is_directed(G):
a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
else:
a = triu(nx.adjacency_matrix(G, nodelist=range(len(G.nodes))), k=1)
# Compute initial statistics and linear indx of nonzeros
n = a.shape[0]
num_tr_e = int(a.nnz * train_frac)
nz_lin_ind = np.ravel_multi_index(a.nonzero(), (n, n))
# Build a dft starting at a random node. If dir false returns only upper triangle
dft = depth_first_tree(a, np.random.randint(0, a.shape[0]), directed=nx.is_directed(G))
if nx.is_directed(G):
dft_lin_ind = np.ravel_multi_index(dft.nonzero(), (n, n))
else:
dft_lin_ind = np.ravel_multi_index(triu(tril(dft).T + dft, k=1).nonzero(), (n, n))
# From all nonzero indx remove those in dft. From the rest take enough to fill train quota. Rest are test
rest_lin_ind = np.setdiff1d(nz_lin_ind, dft_lin_ind)
aux = np.random.choice(rest_lin_ind, num_tr_e-len(dft_lin_ind), replace=False)
lin_tr_e = np.union1d(dft_lin_ind, aux)
lin_te_e = np.setdiff1d(rest_lin_ind, aux)
# Unravel the linear indices to obtain src, dst pairs
tr_e = np.array(np.unravel_index(np.array(lin_tr_e), (n, n))).T
te_e = np.array(np.unravel_index(np.array(lin_te_e), (n, n))).T
# Return the sets of edges
return tr_e, te_e
[docs]def naive_split_train_test(G, train_frac=0.51):
"""
Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
naive split approach (see Notes). The resulting train edge set has the following properties: spans a graph
(digraph) with a single connected (weakly connected) component and the same nodes as G.
Parameters
----------
G : graph
A NetworkX graph or digraph with a single connected (weakly connected) component.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
Returns
-------
train_E : set
The set of train edges.
test_E : set
The set of test edges.
Raises
------
ValueError
If the train_frac parameter is not in range (0, 1].
If the input graph G has more than one (weakly) connected component.
Notes
-----
The method proceeds as follows: (1) compute the number of test edges needed from train_frac and size of G.
(2) randomly remove edges from the input graph one at a time. If removing an edge causes the graph to become
disconnected, add it back, otherwise, put it in the set of test edges. (3) repeat the previous step until all
test edges are selected. (4) the remaining edges constitute the train set.
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
if train_frac <= 0.0 or train_frac > 1.0:
raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
if train_frac == 1.0:
return set(G.edges), set()
# Is directed
directed = G.is_directed()
H = G.copy()
# Create a set of all edges in G
aux = np.array(H.edges)
np.random.shuffle(aux)
E = set([tuple(edge) for edge in aux])
# Compute num train edges
num_E = len(E)
num_train_E = np.ceil(train_frac * num_E)
num_test_E = num_E - num_train_E
# Initialize the sets of train and test edges
train_E = set(H.edges())
test_E = set()
# Iterate over shuffled edges, add to train/val sets
for i, edge in enumerate(E):
node1 = edge[0]
node2 = edge[1]
# If removing edge would disconnect a connected component, backtrack and move on
H.remove_edge(node1, node2)
if directed:
if nx.number_weakly_connected_components(H) > 1:
H.add_edge(node1, node2)
continue
else:
if nx.number_connected_components(H) > 1:
H.add_edge(node1, node2)
continue
# Fill test_edges
if len(test_E) < num_test_E:
test_E.add(edge)
train_E.remove(edge)
else:
break
# Perform some simple checks
assert E == (test_E | train_E)
assert len(E) == len(train_E) + len(test_E)
# Return the sets of edges
return train_E, test_E
[docs]def rand_split_train_test(G, train_frac=0.51):
"""
Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
random split approach (see Notes). The resulting train edge set has the following properties: spans a graph
(digraph) with a single connected (weakly connected) component.
Parameters
----------
G : graph
A NetworkX graph or digraph.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
Returns
-------
train_E : set
The set of train edges.
test_E : set
The set of test edges.
Raises
------
ValueError
If the train_frac parameter is not in range (0, 1].
Notes
-----
The method proceeds as follows: (1) randomly remove 1-train_frac percent of edges from the input graph.
(2) from the remaining edges compute the main connected component and these will be the train edges. (3) from the
set of removed edges, those such that both end nodes exist in the train edge set computed in the previous step,
are added to the final test set.
"""
if train_frac <= 0.0 or train_frac > 1.0:
raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
if train_frac == 1.0:
return set(G.edges), set()
# Create a set of all edges in G
E = set(G.edges)
num_E = len(E)
# Compute the potential number of train and test edges which corresponds to the fraction given
num_train_E = int(np.ceil(train_frac * num_E))
num_test_E = int(num_E - num_train_E)
# Randomly remove 1-train_frac edges from the graph and store them as potential test edges
pte_edges = set(random.sample(E, num_test_E))
# The remaining edges are potential train edges
ptr_edges = E - pte_edges
# Create a graph containing all ptr_edges and compute the mainCC
if G.is_directed():
H = nx.DiGraph()
H.add_edges_from(ptr_edges)
maincc = G.subgraph(max(nx.weakly_connected_components(H), key=len)).copy()
else:
H = nx.Graph()
H.add_edges_from(ptr_edges)
maincc = G.subgraph(max(nx.connected_components(H), key=len)).copy()
# The edges in the mainCC graph are the actual train edges
train_E = set(maincc.edges)
# Remove potential test edges for which the end nodes do not exist in the train_E
test_E = set()
for (src, dst) in pte_edges:
if src in maincc.nodes and dst in maincc.nodes:
test_E.add((src, dst))
# Return the sets of edges
return train_E, test_E
[docs]def timestamp_split(G, train_frac=0.51):
"""
Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using edge
timestamps (see Notes). The resulting train edge set has the following properties: spans a graph (digraph) with
a single connected (weakly connected) component.
Parameters
----------
G : graph
A NetworkX graph or digraph where edge weights are timestamps.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
Returns
-------
train_E : ndarray
Column vector of train edges as pairs src, dst.
test_E : ndarray
Column vector of test edges as pairs src, dst.
tg : graph
A NetworkX graph containing only the edges in the train edge set.
Raises
------
ValueError
If the train_frac parameter is not in range (0, 1].
If the input graph G has more than one (weakly) connected component.
Notes
-----
The method proceeds as follows: (1) sort all edges by timestamp. (2) randomly remove 1-train_frac percent of edges
from the input graph. (3) from the remaining edges compute the main connected component and these will be the train
edges. (4) from the set of removed edges, those such that both end nodes exist in the train edge set computed in
the previous step, are added to the final test set.
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
if train_frac <= 0.0 or train_frac > 1.0:
raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
if train_frac == 1.0:
return set(G.edges), set()
# Get Adj matrix
if nx.is_directed(G):
a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
else:
a = triu(nx.adj_matrix(G, nodelist=range(len(G.nodes))), k=1)
# Argsort data and compute the idx where we split train from test
ordered = np.argsort(a.data)
split_idx = int(len(ordered) * train_frac) - 1
# Mask train edges and get all possible edges
mask_tr = ordered > split_idx
nz = a.nonzero()
# Use the mask to select only train and test from nz
# There will be no overlap between tr and te because nz contains only unique pairs
tr_e = np.array((nz[0][~mask_tr], nz[1][~mask_tr])).T
te_e = np.array((nz[0][mask_tr], nz[1][mask_tr])).T
# Taking the most recent edges for testing can cause train to be disconnected so make sure it isn't
tg = nx.Graph()
tg.add_edges_from(tr_e)
tg, ids = pp.prep_graph(tg, relabel=True, del_self_loops=True, maincc=True)
tr_e = np.array(tg.edges)
d = dict(ids)
te_e = set(zip(te_e[:, 0], te_e[:, 1]))
nte_e = map(lambda x: (d.get(x[0], -1), d.get(x[1], -1)), te_e)
te_e = np.array(nte_e)
# We now only keep the test edges between nodes in tg
# Remove nodes that are in test but not train
newn = np.setdiff1d(np.unique(te_e), np.unique(tr_e))
mask = np.isin(te_e, newn).sum(axis=1).astype(bool)
te_e = te_e[~mask, :]
# Return the sets of edges
return tr_e, te_e, tg
##################################################
# Non-edge sampling #
##################################################
[docs]def generate_false_edges_owa(G, train_E, test_E, num_fe_train=None, num_fe_test=None):
"""
Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the open world
assumption (see Notes). Returns sets of train and test non-edges.
Parameters
----------
G : graph
A NetworkX graph or digraph.
train_E : set
The set of train edges.
test_E : set
The set of test edges.
num_fe_train : int, optional
The number of train non-edges to generate. Default is None (same number as train edges).
num_fe_test : int, optional
The number of test non-edges to generate. Default is None (same number as test edges).
Returns
-------
train_E_false : set
The set of train non-edges.
test_E_false : set
The set of test non-edges.
Raises
------
ValueError
If the input graph G has more than one (weakly) connected component.
If more non-edges than existing in the graph are required.
Notes
-----
The open world assumption considers that for generating train non-edges one has access to the train edges only.
Therefore, train non-edges could overlap with test edges. This scenario is common for evolving graphs where edges
can only appear. In this case, one has access to certain train edges present in the graph at time t. Non-edges
are then sampled using this information. At time t+1, the newly arrived test edges may have been previously
selected as train non-edges. For undirected graphs the output is sorted (smallNodeID, bigNodeID).
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
# Create a set of vertices
V = set(G.nodes)
# Initialize the sizes of the non-edges
if num_fe_train is None:
num_fe_train = len(train_E)
if num_fe_test is None:
num_fe_test = len(test_E)
# Make sure the required amount of non-edges can be generated
max_nonedges = len(V) * len(V) - len(train_E)
if num_fe_train > max_nonedges:
raise ValueError('Too many train non-edges required! Max available for train+test is {}'.format(max_nonedges))
else:
if num_fe_train + num_fe_test > max_nonedges:
warnings.warn('Too many non-edges required in train+test! '
'Using maximum number of test non-edges available: {}'.format(max_nonedges - num_fe_train))
num_fe_test = max_nonedges - num_fe_train
# Create sets to store the non-edges
train_E_false = set()
test_E_false = set()
# Generate train non-edges
while len(train_E_false) < num_fe_train:
edge = tuple(random.sample(V, 2))
redge = tuple(reversed(edge))
if edge not in train_E:
if G.is_directed():
train_E_false.add(edge)
else:
if redge not in train_E:
train_E_false.add(tuple(sorted(edge)))
# Generate test non-edges
while len(test_E_false) < num_fe_test:
edge = tuple(random.sample(V, 2))
redge = tuple(reversed(edge))
if edge not in train_E and edge not in test_E and edge not in train_E_false:
if G.is_directed():
test_E_false.add(edge)
else:
if redge not in train_E and redge not in test_E and redge not in train_E_false:
test_E_false.add(tuple(sorted(edge)))
# Perform some simple check before returning the result
assert len(train_E_false) == num_fe_train
assert len(test_E_false) == num_fe_test
assert train_E_false.isdisjoint(test_E_false)
assert train_E_false.isdisjoint(train_E)
assert test_E_false.isdisjoint(train_E | test_E)
# Return the sets of non-edges
return train_E_false, test_E_false
[docs]def generate_false_edges_cwa(G, train_E, test_E, num_fe_train=None, num_fe_test=None):
"""
Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the closed world
assumption (see Notes). Returns sets of train and test non-edges.
Parameters
----------
G : graph
A NetworkX graph or digraph.
train_E : set
The set of train edges.
test_E : set
The set of test edges.
num_fe_train : int, optional
The number of train non-edges to generate. Default is None (same number as train edges).
num_fe_test : int, optional
The number of test non-edges to generate. Default is None (same number as test edges).
Returns
-------
train_E_false : set
The set of train non-edges.
test_E_false : set
The set of test non-edges.
Raises
------
ValueError
If the input graph G has more than one (weakly) connected component.
If more non-edges than existing in the graph are required.
Notes
-----
The closed world assumption considers that one is certain about some node pairs being non-edges. Therefore,
train non-edges cannot overlap with wither train not test edges. This scenario is common for static graphs
where information about both the edges (positive interactions) and non-edges (negative interactions) is knows,
e.g. protein protein interaction networks.
"""
# Sanity check to make sure the input is correct
_sanity_check(G)
# Create a set of vertices
V = set(G.nodes)
# Initialize the sizes of the non-edges
if num_fe_train is None:
num_fe_train = len(train_E)
if num_fe_test is None:
num_fe_test = len(test_E)
# Make sure the required amount of non-edges can be generated
max_nonedges = len(V) * len(V) - len(G.edges)
if num_fe_train > max_nonedges:
raise ValueError(
'Too many train non-edges required! Max available for train+test is {}'.format(max_nonedges))
else:
if num_fe_train + num_fe_test > max_nonedges:
warnings.warn('Too many non-edges required in train+test! '
'Using maximum number of test non-edges available: {}'.format(max_nonedges - num_fe_train))
# num_fe_test = max_nonedges - num_fe_train
return _getall_false_edges(G, (1.0*num_fe_train)/max_nonedges)
# Create sets to store the non-edges
train_E_false = set()
test_E_false = set()
# Generate train non-edges
while len(train_E_false) < num_fe_train:
edge = tuple(random.sample(V, 2))
redge = tuple(reversed(edge))
if edge not in train_E and edge not in test_E:
if G.is_directed():
train_E_false.add(edge)
else:
if redge not in train_E and redge not in test_E:
train_E_false.add(tuple(sorted(edge)))
# Generate test non-edges
while len(test_E_false) < num_fe_test:
edge = tuple(random.sample(V, 2))
redge = tuple(reversed(edge))
if edge not in train_E and edge not in test_E and edge not in train_E_false:
if G.is_directed():
test_E_false.add(edge)
else:
if redge not in train_E and redge not in test_E and redge not in train_E_false:
test_E_false.add(tuple(sorted(edge)))
# Perform some simple check before returning the result
assert len(train_E_false) == num_fe_train
assert len(test_E_false) == num_fe_test
assert train_E_false.isdisjoint(test_E_false)
assert train_E_false.isdisjoint(train_E | test_E)
assert test_E_false.isdisjoint(train_E | test_E)
# Return the sets of non-edges
return train_E_false, test_E_false
def _getall_false_edges(G, fe_train_frac):
"""
Helper function for generating all non-edge pairs possible and splitting them into train and test sets.
Parameters
----------
G : graph
A NetworkX graph or digraph.
fe_train_frac : float
The proportion of train non-edges w.r.t. the total number of non-edges required (range (0.0, 1.0]).
Returns
-------
train_E_false : set
The set of train non-edges.
test_E_false : set
The set of test non-edges.
"""
print("Generating all non-edges and splitting them in train and test...")
train_E_false = list()
test_E_false = list()
for e in nx.non_edges(G):
r = random.uniform(0, 1)
if r <= fe_train_frac:
train_E_false.append(e)
else:
test_E_false.append(e)
# Return the sets of non-edges
return train_E_false, test_E_false
[docs]def quick_nonedges(G, train_frac=0.51, fe_ratio=1.0):
"""
Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the open world
assumption. Is a more efficient implementation of `generate_false_edges_owa` which also does not require the
train and test edge sets as input. Returns sets of train and test non-edges.
Parameters
----------
G : graph
A NetworkX graph or digraph.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
fe_ratio : float, optional
The ratio of non-edges to edges to sample. For fr_ratio > 0 and < 1 less non-edges than edges will be
generated. For fe_edges > 1 more non-edges than edges will be generated. Default 1, same amounts.
Returns
-------
train_E_false : ndarray
Column vector of train non-edges as pairs src, dst.
test_E_false : ndarray
Column vector of test non-edges as pairs src, dst.
Raises
------
ValueError
If more non-edges than existing in the graph are required.
"""
# fe_ration can be any float or keyword 'prop'
a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
n = a.shape[0]
density = a.nnz / n ** 2
if fe_ratio == 'prop':
fe_ratio = np.floor(1.0 / density)
if not nx.is_directed(G):
num_fe = int((a.nnz/2.0) * fe_ratio)
else:
num_fe = int(a.nnz * fe_ratio)
num_fe_tr = int(train_frac * num_fe)
# Make sure we have enough non-edges
if num_fe > (n**2 - (a.nnz + n)):
raise ValueError('Too many non-edges required!')
# Get linear indexes of 1s in A
lin_indexes = np.ravel_multi_index(a.nonzero(), (n, n))
inv_indx = np.union1d(lin_indexes, np.ravel_multi_index(np.diag_indices(n), (n, n)))
# we could generate more FE than we need to make sure we find enough 0s
candidates = np.random.randint(0, n**2, size=int(num_fe/(1-density)))
# make sure there is no overlap
fe_lin_ind = np.setdiff1d(candidates, inv_indx)
while len(fe_lin_ind) < num_fe:
new_cands = np.random.randint(0, n ** 2, size=num_fe-len(fe_lin_ind))
valid_cands = np.setdiff1d(new_cands, inv_indx)
fe_lin_ind = np.union1d(fe_lin_ind, valid_cands)
fe_lin_ind = fe_lin_ind[:num_fe]
aux = np.array(np.unravel_index(fe_lin_ind, (n, n))).T
fe_tr = aux[:num_fe_tr, :]
fe_te = aux[num_fe_tr:, :]
# Return the sets of non-edges
return fe_tr, fe_te
##################################################
# Node-pair Sampling #
##################################################
def _compute_one_split(G, output_path, owa=True, train_frac=0.51, num_fe_train=None, num_fe_test=None, split_id=0):
"""
Splits the edges of the input graph in sets of train and test using the spanning tree approach and generates sets
of train and test non-edges. The four resulting sets of node pairs are written to different files.
The resulting train edge set has the following properties: spans a graph (digraph) with a single connected
(weakly connected) component and the same nodes as G.
Parameters
----------
G : graph
A NetworkX graph or digraph with a single connected (weakly connected) component.
output_path : string
Indicates the path where the output will be stored.
owa : bool, optional
Encodes the belief that the network respects or not the open world assumption. Default is True.
If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
num_fe_train : int, optional
The number of train non-edges to generate. Default is None (same number as train edges).
num_fe_test : int, optional
The number of test non-edges to generate. Default is None (same number as test edges).
split_id : int, optional
A unique ID for this train/test split. Default is 0.
See also
--------
evalne.utils.split_train_test.split_train_test :
Method used to split graph edges in train and test sets.
evalne.utils.split_train_test.generate_false_edges_owa :
Method used to generate non-edges if owa=True.
evalne.utils.split_train_test.generate_false_edges_cwa :
Method used to generate non-edges if owa=False.
"""
# Generate train and test edge splits
train_E, test_E = split_train_test(G, train_frac)
# Generate the train/test false edges
if owa:
train_E_false, test_E_false = generate_false_edges_owa(G, train_E, test_E, num_fe_train, num_fe_test)
else:
train_E_false, test_E_false = generate_false_edges_cwa(G, train_E, test_E, num_fe_train, num_fe_test)
# Write the computed split to a file
store_train_test_splits(output_path, train_E, train_E_false, test_E, test_E_false, split_id)
[docs]def compute_splits_parallel(G, output_path, owa=True, train_frac=0.51, num_fe_train=None, num_fe_test=None,
num_splits=10):
"""
Computes in parallel the required number of train/test splits of input graph edges. Also generated the same number
of train and test non-edge sets. The resulting sets of node pairs are written to different files. The resulting
train edge set has the following properties: spans a graph (digraph) with a single connected (weakly connected)
component and the same nodes as G.
Parameters
----------
G : graph
A NetworkX graph or digraph with a single connected (weakly connected) component.
output_path : string
Indicates the path where data will be stored. Can include a name for all splits to share.
owa : bool, optional
Encodes the belief that the network respects or not the open world assumption. Default is True.
If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
train_frac : float, optional
The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
Default is 0.51.
num_fe_train : int, optional
The number of train non-edges to generate. Default is None (same number as train edges).
num_fe_test : int, optional
The number of test non-edges to generate. Default is None (same number as test edges).
num_splits : int, optional
The number of train/test splits to generate. Default is 10.
See also
--------
evalne.utils.split_train_test.split_train_test :
This method used to split graph edges in train and test.
evalne.utils.split_train_test.generate_false_edges_owa :
Method used to generate non-edges if owa=True.
evalne.utils.split_train_test.generate_false_edges_cwa :
Method used to generate non-edges if owa=False.
"""
# Compute the splits sequentially or in parallel
backend = 'multiprocessing'
path_func = delayed(_compute_one_split)
Parallel(n_jobs=num_splits, verbose=True, backend=backend)(
path_func(G, output_path, owa, train_frac, num_fe_train, num_fe_test, split) for split in range(num_splits))
[docs]def random_edge_sample(a, samp_frac=0.01, directed=False, sample_diag=False):
"""
Samples uniformly at random positions from the input adjacency matrix. The samples are returned in two sets, one
containing all edges sampled and one containing all non-edges.
Parameters
----------
a : sparse matrix
A sparse adjacency matrix.
samp_frac : float, optional
A float in range [0,1] representing the fraction of all edge pairs to sample. Default is 0.01 (1%)
directed : bool, optional
A flag indicating if the adjacency matrix should be considered directed or undirected. If undirected,
indices are obtained only from the lower triangle. Default is False.
sample_diag : bool, optional
If True elements from the main diagonal are also sampled (i.e. self loops). Else they are excluded from the
sampling process. Default is False.
Returns
-------
pos_e : ndarray
Sampled node pairs that are edges.
neg_e : ndarray
Sampled node pairs that are non-edges.
"""
if samp_frac > 1.0 or samp_frac < 0.0:
raise ValueError("The samp_frac parameter is expected to be in range [0,1].")
n = a.shape[0]
num_samp = int(n ** 2 * samp_frac)
# Generate a mask of sampled elements
lin_ind = np.random.choice(n ** 2, size=num_samp, replace=False)
j = np.floor(lin_ind * 1. / n).astype(int, copy=False)
i = (lin_ind - j * n).astype(int, copy=False)
v = np.ones(num_samp, dtype=bool)
mask = csr_matrix((v, (i, j)), shape=(n, n))
if directed:
if not sample_diag:
mask.setdiag(0)
mask.eliminate_zeros()
else:
# For undir graphs we only look at the upper triangle
if sample_diag:
mask = triu(mask, k=0)
else:
mask = triu(mask, k=1)
# Extract the linear indices of the nonzeros in a
lin_indx_samp = np.ravel_multi_index(mask.nonzero(), (n, n))
# All positive edges sampled in mask will stay in aux
aux = mask.multiply(a)
pos_e = np.array(aux.nonzero()).T
# The rest of the lin indx not positive are negative
lin_indx_ne = np.setdiff1d(lin_indx_samp, np.ravel_multi_index(aux.nonzero(), (n, n)))
neg_e = np.array(np.unravel_index(lin_indx_ne, (n, n))).T
# Return the node pairs
return pos_e, neg_e
##################################################
# Spanning tree computation #
##################################################
[docs]def broder_alg(G, E):
"""
Runs Andrei Broder's algorithm [1]_ to select uniformly at random a spanning tree of the input graph.
The method works for directed and undirected networks. The edge directions in the resulting spanning tree
are taken from the input edgelist (E). For node pairs where both edge directions exist, one is chosen at random.
Parameters
----------
G : graph
A NetworkX graph or digraph.
E : set
A set of all directed or undirected edges in G.
Returns
-------
train_E : set
A set of edges of G that form the random spanning tree.
See Also
--------
evalne.utils.split_train_test.wilson_alg :
A more computationally efficient approach for selecting a spanning tree.
References
----------
.. [1] A. Broder, "Generating Random Spanning Trees", Proc. of the 30th Annual Symposium
on Foundations of Computer Science, pp. 442--447, 1989.
"""
# Create two partitions, S and T. Initially store all nodes in S.
S = set(G.nodes)
T = set()
# Pick a random node as the "current node" and mark it as visited.
current_node = random.sample(S, 1).pop()
S.remove(current_node)
T.add(current_node)
# Perform random walk on the graph
train_E = set()
while S:
if G.is_directed():
neighbour_node = random.sample(list(G.successors(current_node)) + list(G.predecessors(current_node)), 1).pop()
else:
neighbour_node = random.sample(list(G.neighbors(current_node)), 1).pop()
if neighbour_node not in T:
S.remove(neighbour_node)
T.add(neighbour_node)
if random.random() < 0.5:
if (current_node, neighbour_node) in E:
train_E.add((current_node, neighbour_node))
else:
train_E.add((neighbour_node, current_node))
else:
if (neighbour_node, current_node) in E:
train_E.add((neighbour_node, current_node))
else:
train_E.add((current_node, neighbour_node))
current_node = neighbour_node
# Return the set of edges constituting the spanning tree
return train_E
[docs]def wilson_alg(G, E):
"""
Runs Willson's algorithm [2]_ also known as loop erasing random walk to select uniformly at random a spanning tree
of the input graph. The method works for directed and undirected networks [3]_. The edge directions in the resulting
spanning tree are taken from the input edgelist (E). For node pairs where both edge directions exist, one is
chosen at random.
Parameters
----------
G : graph
A NetworkX graph or digraph.
E : set
A set of all directed or undirected edges in G.
Returns
-------
train_E : set
A set of edges of G that form the random spanning tree.
See Also
--------
evalne.utils.split_train_test.broder_alg :
A different approach for selecting a spanning tree that could be faster for specific graphs.
References
----------
.. [2] D. B. Wilson, "Generating Random Spanning Trees More Quickly than the Cover Time",
In Proceedings of STOC, pp. 296--303, 1996.
.. [3] J. G. Propp and D. B. Wilson, "How to Get a Perfectly Random Sample from a Generic
Markov Chain and Generate a Random Spanning Tree of a Directed Graph",
Journal of Algorithms 27, pp. 170--217, 1998.
"""
# Stores the nodes which are part of the trees created by the LERW.
intree = set()
# A dictionary which works as a linked list and stores the spanning tree
tree = dict()
# Pick a random node as the root of the spanning tree and add it to intree
# For undirected graphs this is the correct approach
r = random.sample(G.nodes, 1).pop()
intree.add(r)
for node in G.nodes:
i = node
while i not in intree:
# This random successor works for weighted and unweighted graphs because we just
# want to select a bunch of edges from the graph, no matter what the weights are.
if G.is_directed():
tree[i] = random.sample(list(G.successors(i)) + list(G.predecessors(i)), 1).pop()
else:
tree[i] = random.sample(list(G.neighbors(i)), 1).pop()
i = tree[i]
i = node
while i not in intree:
intree.add(i)
i = tree[i]
# Create a set to store the train edges
train_E = set()
# This is only relevant for directed graphs to make the selection of edge direction equiprobable
for e in set(zip(tree.keys(), tree.values())):
if random.random() < 0.5:
if e in E:
train_E.add(e)
else:
train_E.add(e[::-1])
else:
if e[::-1] in E:
train_E.add(e[::-1])
else:
train_E.add(e)
# Return the edges of the random spanning tree
return train_E
##################################################
# Helper functions #
##################################################
def _sanity_check(G):
"""
Helper function that checks if the input graphs contains a single connected component. Raises an error if not.
Parameters
----------
G : graph
A NetworkX graph or digraph.
Raises
------
ValueError
If the graph has more than one (weakly) connected component.
"""
# Compute the number of connected components
if G.is_directed():
num_ccs = nx.number_weakly_connected_components(G)
else:
num_ccs = nx.number_connected_components(G)
# Rise an error if more than one CC exists
if num_ccs != 1:
raise ValueError("Input graph should contain one (weakly) connected component. "
"This graph contains: " + str(num_ccs))
[docs]def store_edgelists(train_path, test_path, train_edges, test_edges):
"""
Writes the train and test edgelists to files with the specified names.
Parameters
----------
train_path : string
Indicates the path where the train data will be stored.
test_path : string
Indicates the path where the test data will be stored.
train_edges : array_like
Set of train edges.
test_edges : array_like
Set of test edges.
"""
# Convert edge-lists to numpy arrays
train_edges = np.array([list(edge_tuple) for edge_tuple in train_edges])
test_edges = np.array([list(edge_tuple) for edge_tuple in test_edges])
# Save the splits in different files
np.savetxt(fname=train_path, X=train_edges, delimiter=',', fmt='%d')
np.savetxt(fname=test_path, X=test_edges, delimiter=',', fmt='%d')
[docs]def store_train_test_splits(output_path, train_E, train_E_false, test_E, test_E_false, split_id=0):
"""
Writes the sets of train and test edges and non-edges to separate files in the provided path. All files will share
the same split number as an identifier. If any folder in the path does not exist, it will be generated.
Parameters
----------
output_path : string
Indicates the path where data will be stored.
train_E : set
Set of train edges.
train_E_false : set
Set of train non-edges.
test_E : set
Set of test edges.
test_E_false : set
Set of test non-edges.
split_id : int, optional
The ID of train/test split to be stored. Default is 0.
Returns
-------
filenames : list
A list of strings, the names and paths of the 4 files where the train and test edges and non-edges are stored.
See Also
--------
evalne.utils.preprocess.read_train_test : A function that can read the generated files.
Notes
-----
This function generates 4 files under <output_path> named: 'trE_<split_id>.csv', 'negTrE_<split_id>.csv',
'teE_<split_id>.csv', 'negTeE_<split_id>.csv' corresponding respectively to train edges train non-edges,
test edges and test non-edges.
Examples
--------
Writes some data to files under a folder named test:
>>> train_E = ((1,2), (2,3))
>>> train_E_false = ((-1,-2), (-2,-3))
>>> test_E = ((10,20), (20,30))
>>> test_E_false = ((-10,-20), (-20,-30))
>>> store_train_test_splits("./test", train_E, train_E_false, test_E, test_E_false, split_id=0)
('./test/trE_0.csv', './test/negTrE_0.csv', './test/teE_0.csv', './test/negTeE_0.csv')
"""
# Create path if it does not exist
if not os.path.exists(output_path):
os.makedirs(output_path)
# Convert edge-lists to numpy arrays
train_E = np.array([list(edge_tuple) for edge_tuple in train_E])
train_E_false = np.array([list(edge_tuple) for edge_tuple in train_E_false])
test_E = np.array([list(edge_tuple) for edge_tuple in test_E])
test_E_false = np.array([list(edge_tuple) for edge_tuple in test_E_false])
filenames = (os.path.join(output_path, "trE_{}.csv".format(split_id)),
os.path.join(output_path, "negTrE_{}.csv".format(split_id)),
os.path.join(output_path, "teE_{}.csv".format(split_id)),
os.path.join(output_path, "negTeE_{}.csv".format(split_id)))
# Save the splits in different files
np.savetxt(fname=filenames[0], X=train_E, delimiter=',', fmt='%d')
np.savetxt(fname=filenames[1], X=train_E_false, delimiter=',', fmt='%d')
np.savetxt(fname=filenames[2], X=test_E, delimiter=',', fmt='%d')
np.savetxt(fname=filenames[3], X=test_E_false, delimiter=',', fmt='%d')
# Return the names given to the 4 files where data is stored
return filenames
[docs]def check_overlap(filename, num_sets):
"""
Shows the amount of overlap (shared elements) between sets of node pairs with different split IDs. This function
assumes the existance of num_sets files sharing the same name and with split IDs starting from 0 to num_sets.
Parameters
----------
filename : string
Indicates the path and name (without split ID) of the first set.
The sets are assumed to have sequential split IDs starting at 0.
num_sets : int
The number of sets for which to check the overlap.
See Also
--------
evalne.utils.split_train_test.store_train_test_splits :
The filename is expected to follow the naming convention of this function without "_<split_id>.csv".
Examples
--------
Checking the overlap of several files in the test folder containing train edges:
>>> check_overlap("./test/tr_E", 2)
Intersection of 2 sets is 10
Union of 2 sets is 15
Jaccard coefficient: 0.666666666667
"""
# Load the first set and transform it into a list of tuples
S = np.loadtxt(filename + "_0.csv", delimiter=',', dtype=int)
S = set(map(tuple, S))
# Initialize the intersection and union sets as all elements in first edge set
intrs = S
union = S
# Sequentially add the rest of the sets and check overlap
for i in range(num_sets - 1):
# Read a new edge set
S = np.loadtxt(filename + "_{}.csv".format(i + 1), delimiter=',', dtype=int)
S = set(map(tuple, S))
# Update intersection and union sets
intrs = intrs & S
union = union | S
# Print the information on screen
print("Intersection of {} sets is {}".format(i + 2, len(intrs)))
print("Union of {} sets is {}".format(i + 2, len(union)))
print("Jaccard coefficient: {}".format(len(intrs) / len(union)))
print("")
[docs]def redges_false(train_E, test_E, output_path=None):
"""
For directed graphs computes all non-edges (a->b) such that the opposite edge (a<-b) exists in the graph.
It does this for both the train and test edge sets.
Parameters
----------
train_E : set
The set of train edges.
test_E : set
The set of test edges.
output_path : string, optional
A path or file where to store the results. If None, results are not stored only returned. Default is None.
Returns
-------
train_redges_false : set
A set of train edges respecting the mentioned property.
test_redges_false : set
A set of test edges respecting the mentioned property.
Notes
-----
These non-edges can be used to asses the performance of the embedding methods on predicting non-reciprocated edges.
"""
# Reverse all train and test edges
train_redges_false = set(tuple(reversed(edge_tuple)) for edge_tuple in train_E)
test_redges_false = set(tuple(reversed(edge_tuple)) for edge_tuple in test_E)
# Keep only the reversed edges which are not real train edges
train_redges_false = train_redges_false - train_E
# Keep only the test reversed edges which are not true edges in the graph
test_redges_false = test_redges_false - train_E
test_redges_false = test_redges_false - test_E
if output_path is not None:
# Store the reversed edges
train_redges_false_np = np.array([list(edge_tuple) for edge_tuple in train_redges_false])
test_redges_false_np = np.array([list(edge_tuple) for edge_tuple in test_redges_false])
# Save the splits in different files
np.savetxt(output_path, train_redges_false_np, delimiter=',', fmt='%d')
np.savetxt(output_path, test_redges_false_np, delimiter=',', fmt='%d')
# Return the computed sets
return train_redges_false, test_redges_false