Source code for evalne.utils.split_train_test

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file provides a set of functions for (1) edge sampling: sampling sets of train and test edges from given input
# graphs, (2) non-edge sampling: sampling sets of train and test non-edges i.e. pairs of unconnected nodes,
# (3) node-pair sampling: randomly sampling a subset of all possible node pairs including edges and non-edges.
# Additional functions for computing a spanning tree of a given graph are provided.

import os
import random
import warnings
import numpy as np
import networkx as nx

from joblib import Parallel, delayed
from scipy.sparse import csr_matrix
from scipy.sparse import tril
from scipy.sparse import triu
from scipy.sparse.csgraph import depth_first_tree

from evalne.utils import preprocess as pp


##################################################
#                 Edge sampling                  #
##################################################

[docs]def split_train_test(G, train_frac=0.51, st_alg='wilson'):
    """
    Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
    spanning tree approach (see Notes). The resulting train edge set has the following properties: spans a graph
    (digraph) with a single connected (weakly connected) component and the same nodes as G.
    
    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly connected) component.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    st_alg : string, optional
        The algorithm to use for generating the spanning tree constituting the backbone of the train set.
        Options are: 'wilson' and 'broder'. The first option, 'wilson' is faster in most cases. Default is 'wilson'.

    Returns
    -------
    train_E : set
       The set of train edges.
    test_E : set
       The set of test edges.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    See Also
    --------
    evalne.utils.split_train_test.broder_alg, evalne.utils.split_train_test.wilson_alg :
        The two options for the st_alg parameter.

    Notes
    -----
    The method proceeds as follows: (1) a spanning tree of the input graph is selected uniformly at random using broder
    or wilson's algorithm, (2) randomly selected edges are added to those of the spanning tree until train_frac is
    reached, (3) the remaining edges, not used in previous steps, form the test set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges), set()

    # Create a set of all edges in G
    E = set(G.edges)

    if st_alg == 'broder':
        # Compute a random spanning tree using broder's algorithm
        train_E = broder_alg(G, E)
    else:
        # Compute a random spanning tree using wilson's algorithm
        train_E = wilson_alg(G, E)

    # Fill test edge set as all edges not in the spanning tree
    test_E = E - train_E

    # Compute num train edges
    num_E = len(E)
    num_train_E = np.ceil(train_frac * num_E)

    # Check if the num edges in the spanning tree is already greater than the num train edges
    num_toadd = int(num_train_E - len(train_E))
    if num_toadd <= 0:
        print("WARNING: In order to return a connected train set the train_frac parameter needs to be higher!")
        print("In this case, the provided train set constitutes a random spanning tree of the input graph.")
        print("The train_frac value used is: {}".format(len(train_E) / num_E))
        print("Edges requested: train = {}, test = {}".format(num_train_E, num_E - num_train_E))
        print("Edges returned: train = {}, test = {}".format(len(train_E), num_E - len(train_E)))
    else:
        # Add more edges to train set from test set until it has desired size
        edges = set(random.sample(test_E, num_toadd))
        test_E = test_E - edges
        train_E = train_E | edges

    # Perform some simple checks
    assert E == (test_E | train_E)
    assert len(E) == len(test_E) + len(train_E)
    if num_toadd > 0:
        assert num_train_E == len(train_E)

    # Return the sets of edges
    return train_E, test_E


[docs]def quick_split(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
    quick split approach (see Notes). The resulting train edge set has the following properties: spans a graph
    (digraph) with a single connected (weakly connected) component and the same nodes as G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly connected) component.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : ndarray
       Column vector of train edges as pairs src, dst.
    test_E : ndarray
       Column vector of test edges as pairs src, dst.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) a spanning tree of the input graph is generated using a depth first tree
    approach starting at a random node, (2) randomly selected edges are added to those of the spanning tree until
    train_frac is reached, (3) the remaining edges, not used in previous steps, form the test set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges), set()

    # Get Adj matrix
    if nx.is_directed(G):
        a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
    else:
        a = triu(nx.adjacency_matrix(G, nodelist=range(len(G.nodes))), k=1)

    # Compute initial statistics and linear indx of nonzeros
    n = a.shape[0]
    num_tr_e = int(a.nnz * train_frac)
    nz_lin_ind = np.ravel_multi_index(a.nonzero(), (n, n))

    # Build a dft starting at a random node. If dir false returns only upper triangle
    dft = depth_first_tree(a, np.random.randint(0, a.shape[0]), directed=nx.is_directed(G))
    if nx.is_directed(G):
        dft_lin_ind = np.ravel_multi_index(dft.nonzero(), (n, n))
    else:
        dft_lin_ind = np.ravel_multi_index(triu(tril(dft).T + dft, k=1).nonzero(), (n, n))

    # From all nonzero indx remove those in dft. From the rest take enough to fill train quota. Rest are test
    rest_lin_ind = np.setdiff1d(nz_lin_ind, dft_lin_ind)
    aux = np.random.choice(rest_lin_ind, num_tr_e-len(dft_lin_ind), replace=False)
    lin_tr_e = np.union1d(dft_lin_ind, aux)
    lin_te_e = np.setdiff1d(rest_lin_ind, aux)

    # Unravel the linear indices to obtain src, dst pairs
    tr_e = np.array(np.unravel_index(np.array(lin_tr_e), (n, n))).T
    te_e = np.array(np.unravel_index(np.array(lin_te_e), (n, n))).T

    # Return the sets of edges
    return tr_e, te_e


[docs]def naive_split_train_test(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
    naive split approach (see Notes). The resulting train edge set has the following properties: spans a graph
    (digraph) with a single connected (weakly connected) component and the same nodes as G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly connected) component.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : set
       The set of train edges.
    test_E : set
        The set of test edges.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) compute the number of test edges needed from train_frac and size of G.
    (2) randomly remove edges from the input graph one at a time. If removing an edge causes the graph to become
    disconnected, add it back, otherwise, put it in the set of test edges. (3) repeat the previous step until all
    test edges are selected. (4) the remaining edges constitute the train set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges), set()

    # Is directed
    directed = G.is_directed()
    H = G.copy()

    # Create a set of all edges in G
    aux = np.array(H.edges)
    np.random.shuffle(aux)
    E = set([tuple(edge) for edge in aux])

    # Compute num train edges
    num_E = len(E)
    num_train_E = np.ceil(train_frac * num_E)
    num_test_E = num_E - num_train_E

    # Initialize the sets of train and test edges
    train_E = set(H.edges())
    test_E = set()

    # Iterate over shuffled edges, add to train/val sets
    for i, edge in enumerate(E):
        node1 = edge[0]
        node2 = edge[1]

        # If removing edge would disconnect a connected component, backtrack and move on
        H.remove_edge(node1, node2)
        if directed:
            if nx.number_weakly_connected_components(H) > 1:
                H.add_edge(node1, node2)
                continue
        else:
            if nx.number_connected_components(H) > 1:
                H.add_edge(node1, node2)
                continue

        # Fill test_edges
        if len(test_E) < num_test_E:
            test_E.add(edge)
            train_E.remove(edge)
        else:
            break

    # Perform some simple checks
    assert E == (test_E | train_E)
    assert len(E) == len(train_E) + len(test_E)

    # Return the sets of edges
    return train_E, test_E


[docs]def rand_split_train_test(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using the
    random split approach (see Notes). The resulting train edge set has the following properties: spans a graph
    (digraph) with a single connected (weakly connected) component.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : set
        The set of train edges.
    test_E : set
        The set of test edges.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].

    Notes
    -----
    The method proceeds as follows: (1) randomly remove 1-train_frac percent of edges from the input graph.
    (2) from the remaining edges compute the main connected component and these will be the train edges. (3) from the
    set of removed edges, those such that both end nodes exist in the train edge set computed in the previous step,
    are added to the final test set.
    """
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges), set()

    # Create a set of all edges in G
    E = set(G.edges)
    num_E = len(E)

    # Compute the potential number of train and test edges which corresponds to the fraction given
    num_train_E = int(np.ceil(train_frac * num_E))
    num_test_E = int(num_E - num_train_E)

    # Randomly remove 1-train_frac edges from the graph and store them as potential test edges
    pte_edges = set(random.sample(E, num_test_E))

    # The remaining edges are potential train edges
    ptr_edges = E - pte_edges

    # Create a graph containing all ptr_edges and compute the mainCC
    if G.is_directed():
        H = nx.DiGraph()
        H.add_edges_from(ptr_edges)
        maincc = G.subgraph(max(nx.weakly_connected_components(H), key=len)).copy()
    else:
        H = nx.Graph()
        H.add_edges_from(ptr_edges)
        maincc = G.subgraph(max(nx.connected_components(H), key=len)).copy()

    # The edges in the mainCC graph are the actual train edges
    train_E = set(maincc.edges)

    # Remove potential test edges for which the end nodes do not exist in the train_E
    test_E = set()
    for (src, dst) in pte_edges:
        if src in maincc.nodes and dst in maincc.nodes:
            test_E.add((src, dst))

    # Return the sets of edges
    return train_E, test_E


[docs]def timestamp_split(G, train_frac=0.51):
    """
    Splits the edges of the input graph in sets of train and test and returns the results. Split is performed using edge
    timestamps (see Notes). The resulting train edge set has the following properties: spans a graph (digraph) with
    a single connected (weakly connected) component.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph where edge weights are timestamps.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.

    Returns
    -------
    train_E : ndarray
        Column vector of train edges as pairs src, dst.
    test_E : ndarray
        Column vector of test edges as pairs src, dst.
    tg : graph
        A NetworkX graph containing only the edges in the train edge set.

    Raises
    ------
    ValueError
        If the train_frac parameter is not in range (0, 1].
        If the input graph G has more than one (weakly) connected component.

    Notes
    -----
    The method proceeds as follows: (1) sort all edges by timestamp. (2) randomly remove 1-train_frac percent of edges
    from the input graph. (3) from the remaining edges compute the main connected component and these will be the train
    edges. (4) from the set of removed edges, those such that both end nodes exist in the train edge set computed in
    the previous step, are added to the final test set.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)
    if train_frac <= 0.0 or train_frac > 1.0:
        raise ValueError('The train_frac parameter needs to be in range: (0.0, 1.0]')
    if train_frac == 1.0:
        return set(G.edges), set()

    # Get Adj matrix
    if nx.is_directed(G):
        a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
    else:
        a = triu(nx.adj_matrix(G, nodelist=range(len(G.nodes))), k=1)

    # Argsort data and compute the idx where we split train from test
    ordered = np.argsort(a.data)
    split_idx = int(len(ordered) * train_frac) - 1

    # Mask train edges and get all possible edges
    mask_tr = ordered > split_idx
    nz = a.nonzero()

    # Use the mask to select only train and test from nz
    # There will be no overlap between tr and te because nz contains only unique pairs
    tr_e = np.array((nz[0][~mask_tr], nz[1][~mask_tr])).T
    te_e = np.array((nz[0][mask_tr], nz[1][mask_tr])).T

    # Taking the most recent edges for testing can cause train to be disconnected so make sure it isn't
    tg = nx.Graph()
    tg.add_edges_from(tr_e)
    tg, ids = pp.prep_graph(tg, relabel=True, del_self_loops=True, maincc=True)
    tr_e = np.array(tg.edges)

    d = dict(ids)
    te_e = set(zip(te_e[:, 0], te_e[:, 1]))
    nte_e = map(lambda x: (d.get(x[0], -1), d.get(x[1], -1)), te_e)
    te_e = np.array(nte_e)

    # We now only keep the test edges between nodes in tg
    # Remove nodes that are in test but not train
    newn = np.setdiff1d(np.unique(te_e), np.unique(tr_e))
    mask = np.isin(te_e, newn).sum(axis=1).astype(bool)
    te_e = te_e[~mask, :]

    # Return the sets of edges
    return tr_e, te_e, tg


##################################################
#               Non-edge sampling                #
##################################################


[docs]def generate_false_edges_owa(G, train_E, test_E, num_fe_train=None, num_fe_test=None):
    """
    Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the open world
    assumption (see Notes). Returns sets of train and test non-edges.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.
    train_E : set
       The set of train edges.
    test_E : set
       The set of test edges.
    num_fe_train : int, optional
       The number of train non-edges to generate. Default is None (same number as train edges).
    num_fe_test : int, optional
       The number of test non-edges to generate. Default is None (same number as test edges).

    Returns
    -------
    train_E_false : set
       The set of train non-edges.
    test_E_false : set
       The set of test non-edges.

    Raises
    ------
    ValueError
        If the input graph G has more than one (weakly) connected component.
        If more non-edges than existing in the graph are required.

    Notes
    -----
    The open world assumption considers that for generating train non-edges one has access to the train edges only.
    Therefore, train non-edges could overlap with test edges. This scenario is common for evolving graphs where edges
    can only appear. In this case, one has access to certain train edges present in the graph at time t. Non-edges
    are then sampled using this information. At time t+1, the newly arrived test edges may have been previously
    selected as train non-edges. For undirected graphs the output is sorted (smallNodeID, bigNodeID).
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)

    # Create a set of vertices
    V = set(G.nodes)

    # Initialize the sizes of the non-edges
    if num_fe_train is None:
        num_fe_train = len(train_E)

    if num_fe_test is None:
        num_fe_test = len(test_E)

    # Make sure the required amount of non-edges can be generated
    max_nonedges = len(V) * len(V) - len(train_E)
    if num_fe_train > max_nonedges:
        raise ValueError('Too many train non-edges required! Max available for train+test is {}'.format(max_nonedges))
    else:
        if num_fe_train + num_fe_test > max_nonedges:
            warnings.warn('Too many non-edges required in train+test! '
                          'Using maximum number of test non-edges available: {}'.format(max_nonedges - num_fe_train))
            num_fe_test = max_nonedges - num_fe_train

    # Create sets to store the non-edges
    train_E_false = set()
    test_E_false = set()

    # Generate train non-edges
    while len(train_E_false) < num_fe_train:
        edge = tuple(random.sample(V, 2))
        redge = tuple(reversed(edge))
        if edge not in train_E:
            if G.is_directed():
                train_E_false.add(edge)
            else:
                if redge not in train_E:
                    train_E_false.add(tuple(sorted(edge)))

    # Generate test non-edges
    while len(test_E_false) < num_fe_test:
        edge = tuple(random.sample(V, 2))
        redge = tuple(reversed(edge))
        if edge not in train_E and edge not in test_E and edge not in train_E_false:
            if G.is_directed():
                test_E_false.add(edge)
            else:
                if redge not in train_E and redge not in test_E and redge not in train_E_false:
                    test_E_false.add(tuple(sorted(edge)))

    # Perform some simple check before returning the result
    assert len(train_E_false) == num_fe_train
    assert len(test_E_false) == num_fe_test
    assert train_E_false.isdisjoint(test_E_false)
    assert train_E_false.isdisjoint(train_E)
    assert test_E_false.isdisjoint(train_E | test_E)

    # Return the sets of non-edges
    return train_E_false, test_E_false


[docs]def generate_false_edges_cwa(G, train_E, test_E, num_fe_train=None, num_fe_test=None):
    """
    Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the closed world
    assumption (see Notes). Returns sets of train and test non-edges.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.
    train_E : set
       The set of train edges.
    test_E : set
       The set of test edges.
    num_fe_train : int, optional
       The number of train non-edges to generate. Default is None (same number as train edges).
    num_fe_test : int, optional
       The number of test non-edges to generate. Default is None (same number as test edges).

    Returns
    -------
    train_E_false : set
       The set of train non-edges.
    test_E_false : set
       The set of test non-edges.

    Raises
    ------
    ValueError
        If the input graph G has more than one (weakly) connected component.
        If more non-edges than existing in the graph are required.

    Notes
    -----
    The closed world assumption considers that one is certain about some node pairs being non-edges. Therefore,
    train non-edges cannot overlap with wither train not test edges. This scenario is common for static graphs
    where information about both the edges (positive interactions) and non-edges (negative interactions) is knows,
    e.g. protein protein interaction networks.
    """
    # Sanity check to make sure the input is correct
    _sanity_check(G)

    # Create a set of vertices
    V = set(G.nodes)

    # Initialize the sizes of the non-edges
    if num_fe_train is None:
        num_fe_train = len(train_E)

    if num_fe_test is None:
        num_fe_test = len(test_E)

    # Make sure the required amount of non-edges can be generated
    max_nonedges = len(V) * len(V) - len(G.edges)
    if num_fe_train > max_nonedges:
        raise ValueError(
            'Too many train non-edges required! Max available for train+test is {}'.format(max_nonedges))
    else:
        if num_fe_train + num_fe_test > max_nonedges:
            warnings.warn('Too many non-edges required in train+test! '
                          'Using maximum number of test non-edges available: {}'.format(max_nonedges - num_fe_train))
            # num_fe_test = max_nonedges - num_fe_train
            return _getall_false_edges(G, (1.0*num_fe_train)/max_nonedges)

    # Create sets to store the non-edges
    train_E_false = set()
    test_E_false = set()

    # Generate train non-edges
    while len(train_E_false) < num_fe_train:
        edge = tuple(random.sample(V, 2))
        redge = tuple(reversed(edge))
        if edge not in train_E and edge not in test_E:
            if G.is_directed():
                train_E_false.add(edge)
            else:
                if redge not in train_E and redge not in test_E:
                    train_E_false.add(tuple(sorted(edge)))

    # Generate test non-edges
    while len(test_E_false) < num_fe_test:
        edge = tuple(random.sample(V, 2))
        redge = tuple(reversed(edge))
        if edge not in train_E and edge not in test_E and edge not in train_E_false:
            if G.is_directed():
                test_E_false.add(edge)
            else:
                if redge not in train_E and redge not in test_E and redge not in train_E_false:
                    test_E_false.add(tuple(sorted(edge)))

    # Perform some simple check before returning the result
    assert len(train_E_false) == num_fe_train
    assert len(test_E_false) == num_fe_test
    assert train_E_false.isdisjoint(test_E_false)
    assert train_E_false.isdisjoint(train_E | test_E)
    assert test_E_false.isdisjoint(train_E | test_E)

    # Return the sets of non-edges
    return train_E_false, test_E_false


def _getall_false_edges(G, fe_train_frac):
    """
    Helper function for generating all non-edge pairs possible and splitting them into train and test sets.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.
    fe_train_frac : float
        The proportion of train non-edges w.r.t. the total number of non-edges required (range (0.0, 1.0]).

    Returns
    -------
    train_E_false : set
       The set of train non-edges.
    test_E_false : set
       The set of test non-edges.
    """
    print("Generating all non-edges and splitting them in train and test...")
    train_E_false = list()
    test_E_false = list()
    for e in nx.non_edges(G):
        r = random.uniform(0, 1)
        if r <= fe_train_frac:
            train_E_false.append(e)
        else:
            test_E_false.append(e)

    # Return the sets of non-edges
    return train_E_false, test_E_false


[docs]def quick_nonedges(G, train_frac=0.51, fe_ratio=1.0):
    """
    Randomly samples pairs of unconnected nodes or non-edges from the input graph according to the open world
    assumption. Is a more efficient implementation of `generate_false_edges_owa` which also does not require the
    train and test edge sets as input. Returns sets of train and test non-edges.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    fe_ratio : float, optional
        The ratio of non-edges to edges to sample. For fr_ratio > 0 and < 1 less non-edges than edges will be
        generated. For fe_edges > 1 more non-edges than edges will be generated. Default 1, same amounts.

    Returns
    -------
    train_E_false : ndarray
       Column vector of train non-edges as pairs src, dst.
    test_E_false : ndarray
       Column vector of test non-edges as pairs src, dst.

    Raises
    ------
    ValueError
        If more non-edges than existing in the graph are required.
    """
    # fe_ration can be any float or keyword 'prop'
    a = nx.adjacency_matrix(G, nodelist=range(len(G.nodes)))
    n = a.shape[0]
    density = a.nnz / n ** 2
    if fe_ratio == 'prop':
        fe_ratio = np.floor(1.0 / density)
    if not nx.is_directed(G):
        num_fe = int((a.nnz/2.0) * fe_ratio)
    else:
        num_fe = int(a.nnz * fe_ratio)
    num_fe_tr = int(train_frac * num_fe)

    # Make sure we have enough non-edges
    if num_fe > (n**2 - (a.nnz + n)):
        raise ValueError('Too many non-edges required!')

    # Get linear indexes of 1s in A
    lin_indexes = np.ravel_multi_index(a.nonzero(), (n, n))
    inv_indx = np.union1d(lin_indexes, np.ravel_multi_index(np.diag_indices(n), (n, n)))
    # we could generate more FE than we need to make sure we find enough 0s
    candidates = np.random.randint(0, n**2, size=int(num_fe/(1-density)))

    # make sure there is no overlap
    fe_lin_ind = np.setdiff1d(candidates, inv_indx)

    while len(fe_lin_ind) < num_fe:
        new_cands = np.random.randint(0, n ** 2, size=num_fe-len(fe_lin_ind))
        valid_cands = np.setdiff1d(new_cands, inv_indx)
        fe_lin_ind = np.union1d(fe_lin_ind, valid_cands)

    fe_lin_ind = fe_lin_ind[:num_fe]
    aux = np.array(np.unravel_index(fe_lin_ind, (n, n))).T
    fe_tr = aux[:num_fe_tr, :]
    fe_te = aux[num_fe_tr:, :]

    # Return the sets of non-edges
    return fe_tr, fe_te


##################################################
#              Node-pair Sampling                #
##################################################


def _compute_one_split(G, output_path, owa=True, train_frac=0.51, num_fe_train=None, num_fe_test=None, split_id=0):
    """
    Splits the edges of the input graph in sets of train and test using the spanning tree approach and generates sets
    of train and test non-edges. The four resulting sets of node pairs are written to different files.
    The resulting train edge set has the following properties: spans a graph (digraph) with a single connected
    (weakly connected) component and the same nodes as G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly connected) component.
    output_path : string
        Indicates the path where the output will be stored.
    owa : bool, optional
        Encodes the belief that the network respects or not the open world assumption. Default is True.
        If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
        If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    num_fe_train : int, optional
        The number of train non-edges to generate. Default is None (same number as train edges).
    num_fe_test : int, optional
        The number of test non-edges to generate. Default is None (same number as test edges).
    split_id : int, optional
        A unique ID for this train/test split. Default is 0.

    See also
    --------
    evalne.utils.split_train_test.split_train_test :
        Method used to split graph edges in train and test sets.
    evalne.utils.split_train_test.generate_false_edges_owa :
        Method used to generate non-edges if owa=True.
    evalne.utils.split_train_test.generate_false_edges_cwa :
        Method used to generate non-edges if owa=False.
    """
    # Generate train and test edge splits
    train_E, test_E = split_train_test(G, train_frac)

    # Generate the train/test false edges
    if owa:
        train_E_false, test_E_false = generate_false_edges_owa(G, train_E, test_E, num_fe_train, num_fe_test)
    else:
        train_E_false, test_E_false = generate_false_edges_cwa(G, train_E, test_E, num_fe_train, num_fe_test)

    #  Write the computed split to a file
    store_train_test_splits(output_path, train_E, train_E_false, test_E, test_E_false, split_id)


[docs]def compute_splits_parallel(G, output_path, owa=True, train_frac=0.51, num_fe_train=None, num_fe_test=None,
                            num_splits=10):
    """
    Computes in parallel the required number of train/test splits of input graph edges. Also generated the same number
    of train and test non-edge sets. The resulting sets of node pairs are written to different files. The resulting
    train edge set has the following properties: spans a graph (digraph) with a single connected (weakly connected)
    component and the same nodes as G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph with a single connected (weakly connected) component.
    output_path : string
        Indicates the path where data will be stored. Can include a name for all splits to share.
    owa : bool, optional
        Encodes the belief that the network respects or not the open world assumption. Default is True.
        If owa=True, train non-edges are sampled from the train graph only and can overlap with test edges.
        If owa=False, train non-edges are sampled from the full graph and cannot overlap with test edges.
    train_frac : float, optional
        The proportion of train edges w.r.t. the total number of edges in the input graph (range (0.0, 1.0]).
        Default is 0.51.
    num_fe_train : int, optional
        The number of train non-edges to generate. Default is None (same number as train edges).
    num_fe_test : int, optional
        The number of test non-edges to generate. Default is None (same number as test edges).
    num_splits : int, optional
        The number of train/test splits to generate. Default is 10.

    See also
    --------
    evalne.utils.split_train_test.split_train_test :
        This method used to split graph edges in train and test.
    evalne.utils.split_train_test.generate_false_edges_owa :
        Method used to generate non-edges if owa=True.
    evalne.utils.split_train_test.generate_false_edges_cwa :
        Method used to generate non-edges if owa=False.
    """
    # Compute the splits sequentially or in parallel
    backend = 'multiprocessing'
    path_func = delayed(_compute_one_split)
    Parallel(n_jobs=num_splits, verbose=True, backend=backend)(
        path_func(G, output_path, owa, train_frac, num_fe_train, num_fe_test, split) for split in range(num_splits))


[docs]def random_edge_sample(a, samp_frac=0.01, directed=False, sample_diag=False):
    """
    Samples uniformly at random positions from the input adjacency matrix. The samples are returned in two sets, one
    containing all edges sampled and one containing all non-edges.

    Parameters
    ----------
    a : sparse matrix
        A sparse adjacency matrix.
    samp_frac : float, optional
        A float in range [0,1] representing the fraction of all edge pairs to sample. Default is 0.01 (1%)
    directed : bool, optional
        A flag indicating if the adjacency matrix should be considered directed or undirected. If undirected,
        indices are obtained only from the lower triangle. Default is False.
    sample_diag : bool, optional
        If True elements from the main diagonal are also sampled (i.e. self loops). Else they are excluded from the
        sampling process. Default is False.

    Returns
    -------
    pos_e : ndarray
        Sampled node pairs that are edges.
    neg_e : ndarray
        Sampled node pairs that are non-edges.
    """
    if samp_frac > 1.0 or samp_frac < 0.0:
        raise ValueError("The samp_frac parameter is expected to be in range [0,1].")

    n = a.shape[0]
    num_samp = int(n ** 2 * samp_frac)

    # Generate a mask of sampled elements
    lin_ind = np.random.choice(n ** 2, size=num_samp, replace=False)
    j = np.floor(lin_ind * 1. / n).astype(int, copy=False)
    i = (lin_ind - j * n).astype(int, copy=False)
    v = np.ones(num_samp, dtype=bool)
    mask = csr_matrix((v, (i, j)), shape=(n, n))

    if directed:
        if not sample_diag:
            mask.setdiag(0)
            mask.eliminate_zeros()
    else:
        # For undir graphs we only look at the upper triangle
        if sample_diag:
            mask = triu(mask, k=0)
        else:
            mask = triu(mask, k=1)

    # Extract the linear indices of the nonzeros in a
    lin_indx_samp = np.ravel_multi_index(mask.nonzero(), (n, n))

    # All positive edges sampled in mask will stay in aux
    aux = mask.multiply(a)
    pos_e = np.array(aux.nonzero()).T

    # The rest of the lin indx not positive are negative
    lin_indx_ne = np.setdiff1d(lin_indx_samp, np.ravel_multi_index(aux.nonzero(), (n, n)))
    neg_e = np.array(np.unravel_index(lin_indx_ne, (n, n))).T

    # Return the node pairs
    return pos_e, neg_e


##################################################
#          Spanning tree computation             #
##################################################


[docs]def broder_alg(G, E):
    """
    Runs Andrei Broder's algorithm [1]_ to select uniformly at random a spanning tree of the input graph.
    The method works for directed and undirected networks. The edge directions in the resulting spanning tree
    are taken from the input edgelist (E). For node pairs where both edge directions exist, one is chosen at random.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.
    E : set
       A set of all directed or undirected edges in G.

    Returns
    -------
    train_E : set
       A set of edges of G that form the random spanning tree.

    See Also
    --------
    evalne.utils.split_train_test.wilson_alg :
        A more computationally efficient approach for selecting a spanning tree.

    References
    ----------
    .. [1] A. Broder, "Generating Random Spanning Trees", Proc. of the 30th Annual Symposium
           on Foundations of Computer Science, pp. 442--447, 1989.
    """
    # Create two partitions, S and T. Initially store all nodes in S.
    S = set(G.nodes)
    T = set()

    # Pick a random node as the "current node" and mark it as visited.
    current_node = random.sample(S, 1).pop()
    S.remove(current_node)
    T.add(current_node)

    # Perform random walk on the graph
    train_E = set()
    while S:
        if G.is_directed():
            neighbour_node = random.sample(list(G.successors(current_node)) + list(G.predecessors(current_node)), 1).pop()
        else:
            neighbour_node = random.sample(list(G.neighbors(current_node)), 1).pop()
        if neighbour_node not in T:
            S.remove(neighbour_node)
            T.add(neighbour_node)
            if random.random() < 0.5:
                if (current_node, neighbour_node) in E:
                    train_E.add((current_node, neighbour_node))
                else:
                    train_E.add((neighbour_node, current_node))
            else:
                if (neighbour_node, current_node) in E:
                    train_E.add((neighbour_node, current_node))
                else:
                    train_E.add((current_node, neighbour_node))
        current_node = neighbour_node

    # Return the set of edges constituting the spanning tree
    return train_E


[docs]def wilson_alg(G, E):
    """
    Runs Willson's algorithm [2]_ also known as loop erasing random walk to select uniformly at random a spanning tree
    of the input graph. The method works for directed and undirected networks [3]_. The edge directions in the resulting
    spanning tree are taken from the input edgelist (E). For node pairs where both edge directions exist, one is
    chosen at random.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.
    E : set
       A set of all directed or undirected edges in G.

    Returns
    -------
    train_E : set
       A set of edges of G that form the random spanning tree.

    See Also
    --------
    evalne.utils.split_train_test.broder_alg :
        A different approach for selecting a spanning tree that could be faster for specific graphs.

    References
    ----------
    .. [2] D. B. Wilson, "Generating Random Spanning Trees More Quickly than the Cover Time",
           In Proceedings of STOC, pp. 296--303, 1996.
    .. [3] J. G. Propp and D. B. Wilson, "How to Get a Perfectly Random Sample from a Generic
           Markov Chain and Generate a Random Spanning Tree of a Directed Graph",
           Journal of Algorithms 27, pp. 170--217, 1998.
    """
    # Stores the nodes which are part of the trees created by the LERW.
    intree = set()

    # A dictionary which works as a linked list and stores the spanning tree
    tree = dict()

    # Pick a random node as the root of the spanning tree and add it to intree
    # For undirected graphs this is the correct approach
    r = random.sample(G.nodes, 1).pop()
    intree.add(r)

    for node in G.nodes:
        i = node
        while i not in intree:
            # This random successor works for weighted and unweighted graphs because we just
            # want to select a bunch of edges from the graph, no matter what the weights are.
            if G.is_directed():
                tree[i] = random.sample(list(G.successors(i)) + list(G.predecessors(i)), 1).pop()
            else:
                tree[i] = random.sample(list(G.neighbors(i)), 1).pop()
            i = tree[i]
        i = node
        while i not in intree:
            intree.add(i)
            i = tree[i]

    # Create a set to store the train edges
    train_E = set()

    # This is only relevant for directed graphs to make the selection of edge direction equiprobable
    for e in set(zip(tree.keys(), tree.values())):
        if random.random() < 0.5:
            if e in E:
                train_E.add(e)
            else:
                train_E.add(e[::-1])
        else:
            if e[::-1] in E:
                train_E.add(e[::-1])
            else:
                train_E.add(e)

    # Return the edges of the random spanning tree
    return train_E


##################################################
#               Helper functions                 #
##################################################


def _sanity_check(G):
    """
    Helper function that checks if the input graphs contains a single connected component. Raises an error if not.

    Parameters
    ----------
    G : graph
       A NetworkX graph or digraph.

    Raises
    ------
    ValueError
        If the graph has more than one (weakly) connected component.
    """
    # Compute the number of connected components
    if G.is_directed():
        num_ccs = nx.number_weakly_connected_components(G)
    else:
        num_ccs = nx.number_connected_components(G)

    # Rise an error if more than one CC exists
    if num_ccs != 1:
        raise ValueError("Input graph should contain one (weakly) connected component. "
                         "This graph contains: " + str(num_ccs))


[docs]def store_edgelists(train_path, test_path, train_edges, test_edges):
    """
    Writes the train and test edgelists to files with the specified names.

    Parameters
    ----------
    train_path : string
       Indicates the path where the train data will be stored.
    test_path : string
       Indicates the path where the test data will be stored.
    train_edges : array_like
       Set of train edges.
    test_edges : array_like
       Set of test edges.
    """
    # Convert edge-lists to numpy arrays
    train_edges = np.array([list(edge_tuple) for edge_tuple in train_edges])
    test_edges = np.array([list(edge_tuple) for edge_tuple in test_edges])

    # Save the splits in different files
    np.savetxt(fname=train_path, X=train_edges, delimiter=',', fmt='%d')
    np.savetxt(fname=test_path, X=test_edges, delimiter=',', fmt='%d')


[docs]def store_train_test_splits(output_path, train_E, train_E_false, test_E, test_E_false, split_id=0):
    """
    Writes the sets of train and test edges and non-edges to separate files in the provided path. All files will share
    the same split number as an identifier. If any folder in the path does not exist, it will be generated.

    Parameters
    ----------
    output_path : string
       Indicates the path where data will be stored.
    train_E : set
       Set of train edges.
    train_E_false : set
       Set of train non-edges.
    test_E : set
       Set of test edges.
    test_E_false : set
       Set of test non-edges.
    split_id : int, optional
       The ID of train/test split to be stored. Default is 0.

    Returns
    -------
    filenames : list
        A list of strings, the names and paths of the 4 files where the train and test edges and non-edges are stored.

    See Also
    --------
    evalne.utils.preprocess.read_train_test : A function that can read the generated files.

    Notes
    -----
    This function generates 4 files under <output_path> named: 'trE_<split_id>.csv', 'negTrE_<split_id>.csv',
    'teE_<split_id>.csv', 'negTeE_<split_id>.csv' corresponding respectively to train edges train non-edges,
    test edges and test non-edges.

    Examples
    --------
    Writes some data to files under a folder named test:

    >>> train_E = ((1,2), (2,3))
    >>> train_E_false = ((-1,-2), (-2,-3))
    >>> test_E = ((10,20), (20,30))
    >>> test_E_false = ((-10,-20), (-20,-30))
    >>> store_train_test_splits("./test", train_E, train_E_false, test_E, test_E_false, split_id=0)
    ('./test/trE_0.csv', './test/negTrE_0.csv', './test/teE_0.csv', './test/negTeE_0.csv')

    """
    # Create path if it does not exist
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Convert edge-lists to numpy arrays
    train_E = np.array([list(edge_tuple) for edge_tuple in train_E])
    train_E_false = np.array([list(edge_tuple) for edge_tuple in train_E_false])
    test_E = np.array([list(edge_tuple) for edge_tuple in test_E])
    test_E_false = np.array([list(edge_tuple) for edge_tuple in test_E_false])

    filenames = (os.path.join(output_path, "trE_{}.csv".format(split_id)),
                 os.path.join(output_path, "negTrE_{}.csv".format(split_id)),
                 os.path.join(output_path, "teE_{}.csv".format(split_id)),
                 os.path.join(output_path, "negTeE_{}.csv".format(split_id)))

    # Save the splits in different files
    np.savetxt(fname=filenames[0], X=train_E, delimiter=',', fmt='%d')
    np.savetxt(fname=filenames[1], X=train_E_false, delimiter=',', fmt='%d')
    np.savetxt(fname=filenames[2], X=test_E, delimiter=',', fmt='%d')
    np.savetxt(fname=filenames[3], X=test_E_false, delimiter=',', fmt='%d')

    # Return the names given to the 4 files where data is stored
    return filenames


[docs]def check_overlap(filename, num_sets):
    """
    Shows the amount of overlap (shared elements) between sets of node pairs with different split IDs. This function
    assumes the existance of num_sets files sharing the same name and with split IDs starting from 0 to num_sets.

    Parameters
    ----------
    filename : string
       Indicates the path and name (without split ID) of the first set.
       The sets are assumed to have sequential split IDs starting at 0.
    num_sets : int
       The number of sets for which to check the overlap.

    See Also
    --------
    evalne.utils.split_train_test.store_train_test_splits :
        The filename is expected to follow the naming convention of this function without "_<split_id>.csv".

    Examples
    --------
    Checking the overlap of several files in the test folder containing train edges:

    >>> check_overlap("./test/tr_E", 2)
    Intersection of 2 sets is 10
    Union of 2 sets is 15
    Jaccard coefficient: 0.666666666667

    """
    # Load the first set and transform it into a list of tuples
    S = np.loadtxt(filename + "_0.csv", delimiter=',', dtype=int)
    S = set(map(tuple, S))

    # Initialize the intersection and union sets as all elements in first edge set
    intrs = S
    union = S

    # Sequentially add the rest of the sets and check overlap
    for i in range(num_sets - 1):
        # Read a new edge set
        S = np.loadtxt(filename + "_{}.csv".format(i + 1), delimiter=',', dtype=int)
        S = set(map(tuple, S))
        # Update intersection and union sets
        intrs = intrs & S
        union = union | S
        # Print the information on screen
        print("Intersection of {} sets is {}".format(i + 2, len(intrs)))
        print("Union of {} sets is {}".format(i + 2, len(union)))
        print("Jaccard coefficient: {}".format(len(intrs) / len(union)))
        print("")


[docs]def redges_false(train_E, test_E, output_path=None):
    """
    For directed graphs computes all non-edges (a->b) such that the opposite edge (a<-b) exists in the graph.
    It does this for both the train and test edge sets.

    Parameters
    ----------
    train_E : set
       The set of train edges.
    test_E : set
       The set of test edges.
    output_path : string, optional
        A path or file where to store the results. If None, results are not stored only returned. Default is None.

    Returns
    -------
    train_redges_false : set
        A set of train edges respecting the mentioned property.
    test_redges_false : set
        A set of test edges respecting the mentioned property.

    Notes
    -----
    These non-edges can be used to asses the performance of the embedding methods on predicting non-reciprocated edges.
    """
    # Reverse all train and test edges
    train_redges_false = set(tuple(reversed(edge_tuple)) for edge_tuple in train_E)
    test_redges_false = set(tuple(reversed(edge_tuple)) for edge_tuple in test_E)

    # Keep only the reversed edges which are not real train edges
    train_redges_false = train_redges_false - train_E

    # Keep only the test reversed edges which are not true edges in the graph
    test_redges_false = test_redges_false - train_E
    test_redges_false = test_redges_false - test_E

    if output_path is not None:
        # Store the reversed edges
        train_redges_false_np = np.array([list(edge_tuple) for edge_tuple in train_redges_false])
        test_redges_false_np = np.array([list(edge_tuple) for edge_tuple in test_redges_false])
        # Save the splits in different files
        np.savetxt(output_path, train_redges_false_np, delimiter=',', fmt='%d')
        np.savetxt(output_path, test_redges_false_np, delimiter=',', fmt='%d')

    # Return the computed sets
    return train_redges_false, test_redges_false