Source code for grakel.kernels.pyramid_match

"""The pyramid match kernel as in :cite:`nikolentzos2017matching`."""
# Author: Ioannis Siglidis <y.siglidis@gmail.com>
# License: BSD 3 clause
import collections
import warnings

import numpy as np

from itertools import chain

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs

from grakel.graph import Graph
from grakel.kernels import Kernel

# Python 2/3 cross-compatibility import
from six import itervalues
from six import iteritems


[docs]class PyramidMatch(Kernel): """Pyramid match kernel class. Kernel defined in :cite:`nikolentzos2017matching` Parameters ---------- with_labels : bool, default=True A flag that determines if the kernel computation will consider labels. L : int, default=4 Pyramid histogram level. d : int, default=6 The dimension of the hypercube. Attributes ---------- _num_labels : int The number of distinct labels, on the fit data. _labels : dict A dictionary of label enumeration, made from fitted data. """ _graph_format = "adjacency"
[docs] def __init__(self, n_jobs=None, normalize=False, verbose=False, with_labels=True, L=4, d=6): """Initialise a `pyramid_match` kernel.""" super(PyramidMatch, self).__init__(n_jobs=n_jobs, normalize=normalize, verbose=verbose) self.with_labels = with_labels self.L = L self.d = d self._initialized.update({"d": False, "L": False, "with_labels": False})
[docs] def initialize(self): """Initialize all transformer arguments, needing initialization.""" super(PyramidMatch, self).initialize() if not self._initialized["with_labels"]: if type(self.with_labels) != bool: raise TypeError('with labels must be a boolean variable') self._initialized["with_labels"] = True if not self._initialized["L"]: if type(self.L) is not int or self.L < 0: raise TypeError('L: the number of levels must be an integer ' 'bigger equal to 0') self._initialized["L"] = True if not self._initialized["d"]: if type(self.d) is not int or self.d < 1: raise TypeError('d: hypercube dimension must be an ' 'integer bigger than 1') self._initialized["d"] = True
[docs] def parse_input(self, X): """Parse and create features for pyramid_match kernel. Parameters ---------- X : iterable For the input to pass the test, we must have: Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that correspond to the given graph format). A valid input also consists of graph type objects. Returns ------- H : list A list of lists of Histograms for all levels for each graph. """ if not isinstance(X, collections.Iterable): raise TypeError('input must be an iterable\n') else: i = 0 Us = [] if self.with_labels: Ls = [] for (idx, x) in enumerate(iter(X)): is_iter = isinstance(x, collections.Iterable) if is_iter: x = list(x) if is_iter and (len(x) == 0 or (len(x) >= 1 and not self.with_labels) or (len(x) >= 2 and self.with_labels)): if len(x) == 0: warnings.warn('Ignoring empty element on index: ' + str(idx)) continue elif not self.with_labels: x = Graph(x[0], {}, {}, self._graph_format) else: x = Graph(x[0], x[1], {}, self._graph_format) elif not type(x) is Graph: raise TypeError('each element of X must be either a graph object or a list with ' 'at least a graph like object and node labels dict \n') A = x.get_adjacency_matrix() if self.with_labels: L = x.get_labels(purpose="adjacency") i += 1 if A.shape[0] == 0: Us.append(np.zeros((1, self.d))) else: # Perform eigenvalue decomposition. # Rows of matrix U correspond to vertex representations # Embed vertices into the d-dimensional space if A.shape[0] > self.d+1: # If size of graph smaller than d, pad with zeros Lambda, U = eigs(csr_matrix(A, dtype=np.float), k=self.d, ncv=10*self.d) idx = Lambda.argsort()[::-1] U = U[:, idx] else: Lambda, U = np.linalg.eig(A) idx = Lambda.argsort()[::-1] U = U[:, idx] U = U[:, :self.d] # Replace all components by their absolute values U = np.absolute(U) Us.append((A.shape[0], U)) if self.with_labels: Ls.append(L) if i == 0: raise ValueError('parsed input is empty') if self.with_labels: # Map labels to values between 0 and |L|-1 # where |L| is the number of distinct labels if self._method_calling in [1, 2]: self._num_labels = 0 self._labels = set() for L in Ls: self._labels |= set(itervalues(L)) self._num_labels = len(self._labels) self._labels = {l: i for (i, l) in enumerate(self._labels)} return self._histogram_calculation(Us, Ls, self._labels) elif self._method_calling == 3: labels = set() for L in Ls: labels |= set(itervalues(L)) rest_labels = labels - set(self._labels.keys()) nouveau_labels = dict(chain(iteritems(self._labels), ((j, i) for (i, j) in enumerate(rest_labels, len(self._labels))))) return self._histogram_calculation(Us, Ls, nouveau_labels) else: return self._histogram_calculation(Us)
def _histogram_calculation(self, Us, *args): """Calculate histograms. Parameters ---------- Us : list List of tuples with the first element corresponding to the number of vertices of a graph and the second to it's corresponding to vertex embeddings on the d-dimensional space. Ls : list, optional List of labels corresponding to each graph. If provided the histograms are calculated with labels. Labels : dict, optional A big dictionary with enumeration of labels. Returns ------- Hs : list List of histograms for each graph. """ Hs = list() if len(args) == 0: for (i, (n, u)) in enumerate(Us): du = list() if n > 0: for j in range(self.L): # Number of cells along each dimension at level j k = 2**j # Determines the cells in which each vertex lies # along each dimension since nodes lie in the unit # hypercube in R^d D = np.zeros((self.d, k)) T = np.floor(u*k) T[np.where(T == k)] = k-1 for p in range(u.shape[0]): if p >= n: break for q in range(u.shape[1]): # Identify the cell into which the i-th # vertex lies and increase its value by 1 D[q, int(T[p, q])] += 1 du.append(D) Hs.append(du) elif len(args) > 0: Ls = args[0] Labels = args[1] num_labels = len(Labels) for (i, ((n, u), L)) in enumerate(zip(Us, Ls)): du = list() if n > 0: for j in range(self.L): # Number of cells along each dimension at level j k = 2**j # To store the number of vertices that are assigned # a specific label and lie in each of the 2^j cells # of each dimension at level j D = np.zeros((self.d*num_labels, k)) T = np.floor(u*k) T[np.where(T == k)] = k-1 for p in range(u.shape[0]): if p >= n: break for q in range(u.shape[1]): # Identify the cell into which the i-th # vertex lies and increase its value by 1 D[Labels[L[p]]*self.d + q, int(T[p, q])] += 1 du.append(D) Hs.append(du) return Hs
[docs] def pairwise_operation(self, x, y): """Calculate a pairwise kernel between two elements. Parameters ---------- x, y : dict Histograms as produced by `parse_input`. Returns ------- kernel : number The kernel value. """ k = 0 if len(x) != 0 and len(y) != 0: intersec = np.zeros(self.L) for (p, xp, yp) in zip(range(self.L), x, y): # Calculate histogram intersection # (eq. 6 in :cite:`nikolentzos2017matching`) if xp.shape[0] < yp.shape[0]: xpp, ypp = xp, yp[:xp.shape[0], :] elif yp.shape[0] < xp.shape[0]: xpp, ypp = xp[:yp.shape[0], :], yp else: xpp, ypp = xp, yp intersec[p] = np.sum(np.minimum(xpp, ypp)) k += intersec[self.L-1] for p in range(self.L-1): # Computes the new matches that occur at level p. # These matches weight less than those that occur at # higher levels (e.g. p+1 level) k += (1.0/(2**(self.L-p-1)))*(intersec[p]-intersec[p+1]) return k