Source code for evalne.methods.katz

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Mara Alexandru Cristian
# Contact: alexandru.mara@ugent.be
# Date: 18/12/2018

# This file provides two different implementations of the katz score. The first, computes the exact score for the
# complete graph using the adjacency matrix. The second, computes the approximated Katz score for each input node-pair.
# Only undirected Graphs and Digraphs are supported.

import networkx as nx
import numpy as np

__all__ = ['Katz', 'KatzApprox']


[docs]class Katz(object): """ Computes the exact katz similarity based on paths between nodes in the graph. Shorter paths will contribute more than longer ones. This contribution depends of the damping factor 'beta'. The exact katz score is computed using the adj matrix of the full graph. Parameters ---------- G : graph A NetworkX graph or digraph with nodes being consecutive integers starting at 0. beta = float, optional The damping factor for the model. Default is 0.005. Notes ----- The execution is based on dense matrices, so it may run out of memory. """ def __init__(self, G, beta=0.005): self._G = G self.beta = beta self.sim = self._fit() def _fit(self): # Version using sparse matrices # adj = nx.adjacency_matrix(self._G, nodelist=range(len(self._G.nodes))) # ident = sparse.identity(len(self._G.nodes)).tocsc() # sim = inv(ident - adj.multiply(self.beta).T) - ident # adj = nx.adjacency_matrix(self._G, nodelist=range(len(self._G.nodes))) # aux = adj.multiply(-self.beta).T # aux.setdiag(1+aux.diagonal(), k=0) # sim = inv(aux) # sim.setdiag(sim.diagonal()-1) # print(sim.nnz) # print(adj.nnz) # Version using dense matrices adj = nx.adjacency_matrix(self._G, nodelist=range(len(self._G.nodes))) aux = adj.T.multiply(-self.beta).todense() np.fill_diagonal(aux, 1+aux.diagonal()) sim = np.linalg.inv(aux) np.fill_diagonal(sim, sim.diagonal()-1) return sim
[docs] def predict(self, ebunch): """ Computes the katz score for all node-pairs in ebunch. Parameters ---------- ebunch : iterable An iterable of node-pairs for which to compute the katz score. Returns ------- ndarray An array containing the similarity scores. """ ebunch = np.array(ebunch) return np.array(self.sim[ebunch[:, 0], ebunch[:, 1]]).flatten()
[docs] def save_sim_matrix(self, filename): """ Stores the similarity matrix to a file with the given name. Parameters ---------- filename : string The name and path of the file where the similarity matrix should be stored. """ np.savetxt(filename, self.sim, delimiter=',', fmt='%d')
[docs] def get_params(self): """ Returns a dictionary of model parameters. Returns ------- params : dict A dictionary of model parameters and their values. """ params = {'beta': self.beta} return params
[docs]class KatzApprox(object): """ Computes the approximated katz similarity based on paths between nodes in the graph. Shorter paths will contribute more than longer ones. This contribution depends of the damping factor 'beta'. The approximated score is computed using all paths between nodes of length at most 'path_len'. Parameters ---------- G : graph A NetworkX graph or digraph. beta : float, optional The damping factor for the model. Default is 0.005. path_len : int, optional The maximum path length to consider between each pair of nodes. Default is 3. Notes ----- The implementation follows the indication in [1]. It becomes extremely slow for large dense graphs. References ---------- .. [1] R. Laishram "Link Prediction in Dynamic Weighted and Directed Social Network using Supervised Learning" Dissertations - ALL. 355. 2015 """ def __init__(self, G, beta=0.005, path_len=3): self._G = G self.beta = beta self.path_len = path_len
[docs] def fit_predict(self, ebunch): """ Computes the katz score for all node-pairs in ebunch. Parameters ---------- ebunch : iterable An iterable of node-pairs for which to compute the katz score. Returns ------- ndarray An array containing the similarity scores. """ res = list() betas = np.zeros(self.path_len) for i in range(len(betas)): betas[i] = np.power(self.beta, i+1) for u, v in ebunch: paths = np.zeros(self.path_len) for path in nx.all_simple_paths(self._G, source=u, target=v, cutoff=self.path_len): paths[len(path)-2] += 1 # Simple paths output at most path_len+1, plus -1 because indexing at 0 res.append(np.sum(betas * paths)) return np.array(res).reshape(-1, 1)
[docs] def get_params(self): """ Returns a dictionary of model parameters. Returns ------- params : dict A dictionary of model parameters and their values. """ params = {'beta': self.beta, 'path_len': self.path_len} return params