Source code for evalne.methods.similarity

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file provides implementations of a variety heuristics for computing node-pair similarities. These heuristics are
# commonly used as baselines for network embedding evaluation. All functions of the networkx.link_prediction module are
# reimplemented here and extended to directed graphs (following https://surface.syr.edu/etd/355/).
# MultiGraphs and weighted graphs are not supported.

# TODO: the apply_prediction method should probably return a numpy array (like edge_embeddings does) rather than a list.

import math
import random
import numpy as np
import networkx as nx

__all__ = ['common_neighbours',
           'jaccard_coefficient',
           'cosine_similarity',
           'lhn_index',
           'topological_overlap',
           'adamic_adar_index',
           'resource_allocation_index',
           'preferential_attachment',
           'random_prediction',
           'all_baselines']


def _apply_prediction(G, func, ebunch=None):
    r"""
    Applies the given function to each node-pair in ebunch, if this is provided, otherwise, it applies the function to
    all edges in G.

    Parameters
    ----------
    G : graph
        A NetworkX graph or digraph.
    func : func
        A function on two inputs each being a node in the graph. Can return anything, but it should return a value
        representing the likelihood of a "link" between the two nodes.
    ebunch : iterable, optional
        An iterable of node pairs. If None, all edges in G will be used. Default is None.

    Returns
    -------
    sim : list
        A list of node-pair similarities in the same order as ebunch.

    """
    if ebunch is None:
        ebunch = list(G.edges)
    return list(map(lambda e: func(e[0], e[1]), ebunch))


[docs]def common_neighbours(G, ebunch=None, neighbourhood='in'): r""" Computes the common neighbours similarity between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the common neighbours similarity of nodes 'u' and 'v' is defined as: .. math:: |\Gamma(u) \cap \Gamma(v)| For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: |\Gamma_i(u) \cap \Gamma_i(v)| |\Gamma_o(u) \cap \Gamma_o(v)| """ def predict(u, v): return len(set(G[u]) & set(G[v])) def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) return len(su & sv) def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) return len(su & sv) # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def jaccard_coefficient(G, ebunch=None, neighbourhood='in'): r""" Computes the Jaccard coefficient between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the Jaccard coefficient of nodes 'u' and 'v' is defined as: .. math:: |\Gamma(u) \cap \Gamma(v)| / |\Gamma(u) \cup \Gamma(v)| For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{|\Gamma_i(u) \cup \Gamma_i(v)|} \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{|\Gamma_o(u) \cup \Gamma_o(v)|} """ def predict(u, v): union_size = len(set(G[u]) | set(G[v])) if union_size == 0: return 0 return len(list(nx.common_neighbors(G, u, v))) / union_size def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) union_size = len(su | sv) if union_size == 0: return 0 return len(su & sv) / union_size def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) union_size = len(su | sv) if union_size == 0: return 0 return len(su & sv) / union_size # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def cosine_similarity(G, ebunch=None, neighbourhood='in'): r""" Computes the cosine similarity between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the cosine similarity of nodes 'u' and 'v' is defined as: .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{\sqrt{|\Gamma(u)| |\Gamma(v)|}} For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{\sqrt{|\Gamma_i(u)| |\Gamma_i(v)|}} \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{\sqrt{|\Gamma_o(u)| |\Gamma_o(v)|}} """ def predict(u, v): den = math.sqrt(len(set(G[u])) * len(set(G[v]))) if den == 0: return 0 return len(list(nx.common_neighbors(G, u, v))) / den def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) den = math.sqrt(len(su) * len(sv)) if den == 0: return 0 return len(su & sv) / den def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) den = math.sqrt(len(su) * len(sv)) if den == 0: return 0 return len(su & sv) / den # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def lhn_index(G, ebunch=None, neighbourhood='in'): r""" Computes the Leicht-Holme-Newman index [1]_ between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the Leicht-Holme-Newman index of nodes 'u' and 'v' is defined as: .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u)| |\Gamma(v)|} For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{|\Gamma_i(u)| |\Gamma_i(v)|} \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{|\Gamma_o(u)| |\Gamma_o(v)|} References ---------- .. [1] Leicht, E. A. and Holme, Petter and Newman, M. E. J. (2006). "Vertex similarity in networks.", Phys. Rev. E, 73, 10.1103/PhysRevE.73.026120. """ def predict(u, v): den = G.degree(u) * G.degree(v) if den == 0: return 0 return len(list(nx.common_neighbors(G, u, v))) / den def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) den = len(su) * len(sv) if den == 0: return 0 return len(su & sv) / den def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) den = len(su) * len(sv) if den == 0: return 0 return len(su & sv) / den # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def topological_overlap(G, ebunch=None, neighbourhood='in'): r""" Computes the topological overlap [2]_ between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the topological overlap of nodes 'u' and 'v' is defined as: .. math:: \frac{|\Gamma(u) \cap \Gamma(v)|}{min(|\Gamma(u)|,|\Gamma(v)|)} For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \frac{|\Gamma_i(u) \cap \Gamma_i(v)|}{min(|\Gamma_i(u)|,|\Gamma_i(v)|)} \frac{|\Gamma_o(u) \cap \Gamma_o(v)|}{min(|\Gamma_o(u)|,|\Gamma_o(v)|)} References ---------- .. [2] Ravasz, E., Somera, A. L., Mongru, D. A., Oltvai, Z. N., & Barabási, A. L. (2002). "Hierarchical organization of modularity in metabolic networks." Science, 297(5586), 1551-1555. """ def predict(u, v): den = min(G.degree(u), G.degree(v)) if den == 0: return 0 return len(list(nx.common_neighbors(G, u, v))) / den def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) den = min(len(su), len(sv)) if den == 0: return 0 return len(su & sv) / den def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) den = min(len(su), len(sv)) if den == 0: return 0 return len(su & sv) / den # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def adamic_adar_index(G, ebunch=None, neighbourhood='in'): r""" Computes the Adamic-Adar index between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the Adamic-Adar index of nodes 'u' and 'v' is defined as: .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log |\Gamma(w)|} For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \sum_{w \in \Gamma_i(u) \cap \Gamma_i(v)} \frac{1}{\log |\Gamma_i(w)|} \sum_{w \in \Gamma_o(u) \cap \Gamma_o(v)} \frac{1}{\log |\Gamma_o(w)|} """ def predict(u, v): return sum(1 / math.log(G.degree(w)) for w in nx.common_neighbors(G, u, v)) def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) inters = su & sv res = 0 for w in inters: l = len(G.in_edges(w)) if l > 1: res += 1 / math.log(l) return res def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) inters = su & sv res = 0 for w in inters: l = len(G.out_edges(w)) if l > 1: res += 1 / math.log(l) return res # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def resource_allocation_index(G, ebunch=None, neighbourhood='in'): r""" Computes the resource allocation index between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the resource allocation index of nodes 'u' and 'v' is defined as: .. math:: \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{| \Gamma(w) |} For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: \sum_{w \in \Gamma_i(u) \cap \Gamma_i(v)} \frac{1}{|\Gamma_i(w)|} \sum_{w \in \Gamma_o(u) \cap \Gamma_o(v)} \frac{1}{|\Gamma_o(w)|} """ def predict(u, v): return sum(1 / G.degree(w) for w in nx.common_neighbors(G, u, v)) def predict_in(u, v): su = set(map(lambda e: e[0], G.in_edges(u))) sv = set(map(lambda e: e[0], G.in_edges(v))) inters = su & sv res = 0 for w in inters: l = len(G.in_edges(w)) if l > 1: res += 1 / l return res def predict_out(u, v): su = set(map(lambda e: e[1], G.out_edges(u))) sv = set(map(lambda e: e[1], G.out_edges(v))) inters = su & sv res = 0 for w in inters: l = len(G.out_edges(w)) if l > 1: res += 1 / l return res # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def preferential_attachment(G, ebunch=None, neighbourhood='in'): r""" Computes the preferential attachment score between all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs (see Notes for exact definitions). Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. Notes ----- For undirected graphs the preferential attachment score of nodes 'u' and 'v' is defined as: .. math:: |\Gamma(u)| |\Gamma(v)| For directed graphs we can consider either the in or the out-neighbourhoods, respectively: .. math:: |\Gamma_i(u)| |\Gamma_i(v)| |\Gamma_o(u)| |\Gamma_o(v)| """ def predict(u, v): return G.degree(u) * G.degree(v) def predict_in(u, v): return len(G.in_edges(u)) * len(G.in_edges(v)) def predict_out(u, v): return len(G.out_edges(u)) * len(G.out_edges(v)) # Select the appropriate function and return the results if G.is_directed(): if neighbourhood == 'in': return _apply_prediction(G, predict_in, ebunch) elif neighbourhood == 'out': return _apply_prediction(G, predict_out, ebunch) else: raise ValueError("Unknown parameter value.") return _apply_prediction(G, predict, ebunch)
[docs]def random_prediction(G, ebunch=None, neighbourhood='in'): r""" Returns a float drawn uniformly at random from the interval (0.0, 1.0] for all node pairs in ebunch; or all nodes in G, if ebunch is None. Can be computed for directed and undirected graphs. Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional Not used. Returns ------- sim : list A list of node-pair similarities in the same order as ebunch. """ def predict(u, v): return 1 if random.random() > 0.5 else 0 return _apply_prediction(G, predict, ebunch)
[docs]def all_baselines(G, ebunch=None, neighbourhood='in'): r""" Computes a 5-dimensional embedding for all node pairs in ebunch; or all nodes in G, if ebunch is None. Each of the 5 dimensions correspond to the similarity between the nodes as computed by a different function (i.e. CN, JC, AA, RAI and PA). Can be computed for directed and undirected graphs. Parameters ---------- G : graph A NetworkX graph or digraph. ebunch : iterable, optional An iterable of node pairs. If None, all edges in G will be used. Default is None. neighbourhood : string, optional For directed graphs only. Determines if the in or the out-neighbourhood of nodes should be used. Default is 'in'. Returns ------- emb : ndarray Column vector containing node-pair embeddings as rows. Raises ------ ValueError If G is directed and neighbourhood is not one of 'in' or 'out'. """ if ebunch is None: ebunch = list(G.edges) emb = np.zeros((len(ebunch), 5)) for i in range(len(ebunch)): emb[i][0] = common_neighbours(G, [ebunch[i]], neighbourhood)[0] emb[i][1] = jaccard_coefficient(G, [ebunch[i]], neighbourhood)[0] emb[i][2] = adamic_adar_index(G, [ebunch[i]], neighbourhood)[0] emb[i][3] = resource_allocation_index(G, [ebunch[i]], neighbourhood)[0] emb[i][4] = preferential_attachment(G, [ebunch[i]], neighbourhood)[0] return emb