Source code for evalne.methods.similarity

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file provides implementations of a variety heuristics for computing node-pair similarities. These heuristics are
# commonly used as baselines for network embedding evaluation. All functions of the networkx.link_prediction module are
# reimplemented here and extended to directed graphs (following https://surface.syr.edu/etd/355/).
# MultiGraphs and weighted graphs are not supported.

# TODO: the apply_prediction method should probably return a numpy array (like edge_embeddings does) rather than a list.

import math
import random
import numpy as np
import networkx as nx

__all__ = ['common_neighbours',
           'jaccard_coefficient',
           'cosine_similarity',
           'lhn_index',
           'topological_overlap',
           'adamic_adar_index',
           'resource_allocation_index',
           'preferential_attachment',
           'random_prediction',
           'all_baselines']


def _apply_prediction(G, func, ebunch=None):
    r"""
    Applies the given function to each node-pair in ebunch, if this is provided, otherwise, it applies the function to
    all edges in G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    func : func
        A function on two inputs each being a node in the graph. Can return anything, but it should return a value
        representing the likelihood of a "link" between the two nodes.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    """
    if ebunch is None:
        ebunch = list(G.edges)
    return list(map(lambda e: func(e[0], e[1]), ebunch))


[docs]def common_neighbours(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the common neighbours similarity between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the common neighbours similarity of nodes 'u' and 'v' is defined as:

    .. math:: |\Gamma(u) \cap \Gamma(v)|

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        |\Gamma_i(u) \cap \Gamma_i(v)|

        |\Gamma_o(u) \cap \Gamma_o(v)|

    """
    def predict(u, v):
        return len(set(G[u]) & set(G[v]))

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        return len(su & sv)

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        return len(su & sv)

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def jaccard_coefficient(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the Jaccard coefficient between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the Jaccard coefficient of nodes 'u' and 'v' is defined as:

    .. math:: |\Gamma(u) \cap \Gamma(v)| / |\Gamma(u) \cup \Gamma(v)|

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{|\Gamma_i(u) \cup \Gamma_i(v)|}

        \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{|\Gamma_o(u) \cup \Gamma_o(v)|}

    """
    def predict(u, v):
        union_size = len(set(G[u]) | set(G[v]))
        if union_size == 0:
            return 0
        return len(list(nx.common_neighbors(G, u, v))) / union_size

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        union_size = len(su | sv)
        if union_size == 0:
            return 0
        return len(su & sv) / union_size

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        union_size = len(su | sv)
        if union_size == 0:
            return 0
        return len(su & sv) / union_size

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def cosine_similarity(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the cosine similarity between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the cosine similarity of nodes 'u' and 'v' is defined as:

    .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{\sqrt{|\Gamma(u)| |\Gamma(v)|}}

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{\sqrt{|\Gamma_i(u)| |\Gamma_i(v)|}}

        \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{\sqrt{|\Gamma_o(u)| |\Gamma_o(v)|}}

    """
    def predict(u, v):
        den = math.sqrt(len(set(G[u])) * len(set(G[v])))
        if den == 0:
            return 0
        return len(list(nx.common_neighbors(G, u, v))) / den

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        den = math.sqrt(len(su) * len(sv))
        if den == 0:
            return 0
        return len(su & sv) / den

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        den = math.sqrt(len(su) * len(sv))
        if den == 0:
            return 0
        return len(su & sv) / den

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def lhn_index(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the Leicht-Holme-Newman index [1]_ between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the Leicht-Holme-Newman index of nodes 'u' and 'v' is defined as:

    .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u)| |\Gamma(v)|}

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{|\Gamma_i(u)| |\Gamma_i(v)|}

        \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{|\Gamma_o(u)| |\Gamma_o(v)|}

    References
    ----------
    .. [1] Leicht, E. A. and Holme, Petter and Newman, M. E. J. (2006).
           "Vertex similarity in networks.", Phys. Rev. E, 73, 10.1103/PhysRevE.73.026120.
    """
    def predict(u, v):
        den = G.degree(u) * G.degree(v)
        if den == 0:
            return 0
        return len(list(nx.common_neighbors(G, u, v))) / den

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        den = len(su) * len(sv)
        if den == 0:
            return 0
        return len(su & sv) / den

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        den = len(su) * len(sv)
        if den == 0:
            return 0
        return len(su & sv) / den

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def topological_overlap(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the topological overlap [2]_ between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the topological overlap of nodes 'u' and 'v' is defined as:

    .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{min(|\Gamma(u)|,|\Gamma(v)|)}

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{min(|\Gamma_i(u)|,|\Gamma_i(v)|)}

        \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{min(|\Gamma_o(u)|,|\Gamma_o(v)|)}

    References
    ----------
    .. [2] Ravasz, E., Somera, A. L., Mongru, D. A., Oltvai, Z. N., & Barabási, A. L. (2002).
           "Hierarchical organization of modularity in metabolic networks." Science, 297(5586), 1551-1555.
    """
    def predict(u, v):
        den = min(G.degree(u), G.degree(v))
        if den == 0:
            return 0
        return len(list(nx.common_neighbors(G, u, v))) / den

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        den = min(len(su), len(sv))
        if den == 0:
            return 0
        return len(su & sv) / den

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        den = min(len(su), len(sv))
        if den == 0:
            return 0
        return len(su & sv) / den

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def adamic_adar_index(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the Adamic-Adar index between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the Adamic-Adar index of nodes 'u' and 'v' is defined as:

    .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log |\Gamma(w)|}

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \sum_{w \in \Gamma_i(u) \cap \Gamma_i(v)} \frac{1}{\log |\Gamma_i(w)|}

        \sum_{w \in \Gamma_o(u) \cap \Gamma_o(v)} \frac{1}{\log |\Gamma_o(w)|}

    """
    def predict(u, v):
        return sum(1 / math.log(G.degree(w)) for w in nx.common_neighbors(G, u, v))

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        inters = su & sv
        res = 0
        for w in inters:
            l = len(G.in_edges(w))
            if l > 1:
                res += 1 / math.log(l)
        return res

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        inters = su & sv
        res = 0
        for w in inters:
            l = len(G.out_edges(w))
            if l > 1:
                res += 1 / math.log(l)
        return res

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def resource_allocation_index(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the resource allocation index between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the resource allocation index of nodes 'u' and 'v' is defined as:

    .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{| \Gamma(w) |}

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        \sum_{w \in \Gamma_i(u) \cap \Gamma_i(v)} \frac{1}{|\Gamma_i(w)|}

        \sum_{w \in \Gamma_o(u) \cap \Gamma_o(v)} \frac{1}{|\Gamma_o(w)|}

    """
    def predict(u, v):
        return sum(1 / G.degree(w) for w in nx.common_neighbors(G, u, v))

    def predict_in(u, v):
        su = set(map(lambda e: e[0], G.in_edges(u)))
        sv = set(map(lambda e: e[0], G.in_edges(v)))
        inters = su & sv
        res = 0
        for w in inters:
            l = len(G.in_edges(w))
            if l > 1:
                res += 1 / l
        return res

    def predict_out(u, v):
        su = set(map(lambda e: e[1], G.out_edges(u)))
        sv = set(map(lambda e: e[1], G.out_edges(v)))
        inters = su & sv
        res = 0
        for w in inters:
            l = len(G.out_edges(w))
            if l > 1:
                res += 1 / l
        return res

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def preferential_attachment(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes the preferential attachment score between all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Can be computed for directed and undirected graphs (see Notes for exact definitions).

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.

    Notes
    -----
    For undirected graphs the preferential attachment score of nodes 'u' and 'v' is defined as:

    .. math:: |\Gamma(u)| |\Gamma(v)|

    For directed graphs we can consider either the in or the out-neighbourhoods, respectively:

    .. math::

        |\Gamma_i(u)| |\Gamma_i(v)|

        |\Gamma_o(u)| |\Gamma_o(v)|

    """
    def predict(u, v):
        return G.degree(u) * G.degree(v)

    def predict_in(u, v):
        return len(G.in_edges(u)) * len(G.in_edges(v))

    def predict_out(u, v):
        return len(G.out_edges(u)) * len(G.out_edges(v))

    # Select the appropriate function and return the results
    if G.is_directed():
        if neighbourhood == 'in':
            return _apply_prediction(G, predict_in, ebunch)
        elif neighbourhood == 'out':
            return _apply_prediction(G, predict_out, ebunch)
        else:
            raise ValueError("Unknown parameter value.")
    return _apply_prediction(G, predict, ebunch)


[docs]def random_prediction(G, ebunch=None, neighbourhood='in'):
    r"""
    Returns a float drawn uniformly at random from the interval (0.0, 1.0] for all node pairs in ebunch; or all nodes
    in G, if ebunch is None. Can be computed for directed and undirected graphs.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        Not used.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.
    """
    def predict(u, v):
        return 1 if random.random() > 0.5 else 0

    return _apply_prediction(G, predict, ebunch)


[docs]def all_baselines(G, ebunch=None, neighbourhood='in'):
    r"""
    Computes a 5-dimensional embedding for all node pairs in ebunch; or all nodes in G, if ebunch is None.
    Each of the 5 dimensions correspond to the similarity between the nodes as computed by a different function
    (i.e. CN, JC, AA, RAI and PA). Can be computed for directed and undirected graphs.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.
    neighbourhood : string, optional
        For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used.
        Default is 'in'.

    Returns
    -------
    emb : ndarray
        Column vector containing node-pair embeddings as rows.

    Raises
    ------
    ValueError
        If G is directed and neighbourhood is not one of 'in' or 'out'.
    """
    if ebunch is None:
        ebunch = list(G.edges)
    emb = np.zeros((len(ebunch), 5))
    for i in range(len(ebunch)):
        emb[i][0] = common_neighbours(G, [ebunch[i]], neighbourhood)[0]
        emb[i][1] = jaccard_coefficient(G, [ebunch[i]], neighbourhood)[0]
        emb[i][2] = adamic_adar_index(G, [ebunch[i]], neighbourhood)[0]
        emb[i][3] = resource_allocation_index(G, [ebunch[i]], neighbourhood)[0]
        emb[i][4] = preferential_attachment(G, [ebunch[i]], neighbourhood)[0]
    return emb