Source code for grakel.kernels.graph_hopper

"""The Graph Hopper kernel as defined in :cite:`feragen2013scalable`."""
# Author: Ioannis Siglidis <y.siglidis@gmail.com>
# License: BSD 3 clause
from collections import defaultdict
from numbers import Real
from warnings import warn

try:
    # The ABC aliases were removed from `collections` in Python 3.10.
    from collections.abc import Iterable
except ImportError:  # pragma: no cover - Python 2 fallback
    from collections import Iterable

import numpy as np
from numpy.matlib import repmat

# Python 2/3 cross-compatibility import
from six.moves import filterfalse

from grakel.kernels import Kernel
from grakel.graph import Graph
from grakel.graph import dijkstra


class GraphHopper(Kernel):
    """Graph Hopper Histogram kernel as found in :cite:`feragen2013scalable`.

    Parameters
    ----------
    kernel_type : str, tuple or function
        For `kernel_type` of **type**:

            + **str** can either be 'linear', 'gaussian', 'bridge'.
            + **tuple** can be of the form ('gaussian', mu) where mu is a number.
            + **function** can be a function that takes two tuples of np.arrays
              for each graph corresponding to the M matrix and the attribute
              matrix and returns a number.

    Attributes
    ----------
    metric_ : function
        The base metric applied between features.

    calculate_norm_ : bool
        Defines if the norm of the attributes will be calculated
        (in order to avoid recalculation when using it with e.g. gaussian).

    """

    _graph_format = "all"

    def __init__(self, n_jobs=None, normalize=False, verbose=False, kernel_type='linear'):
        """Initialize a Graph Hopper kernel."""
        super(GraphHopper, self).__init__(n_jobs=n_jobs,
                                          normalize=normalize,
                                          verbose=verbose)
        self.kernel_type = kernel_type
        self._initialized.update({"kernel_type": False})

    def initialize(self):
        """Initialize all transformer arguments, needing initialization.

        Resolves `kernel_type` into `metric_` (the pairwise function) and
        `calculate_norm_` (whether `parse_input` must also emit squared
        attribute norms).

        Raises
        ------
        ValueError
            If `kernel_type` is a string that is not a supported kernel name.
        TypeError
            If `kernel_type` is neither a supported string, a
            ``('gaussian', mu)`` tuple nor a callable.

        """
        super(GraphHopper, self).initialize()
        # NOTE(review): this flag is never switched to True in this method, so
        # the resolution below runs on every call - confirm this is intended.
        if not self._initialized["kernel_type"]:
            kernel_type = self.kernel_type
            if isinstance(kernel_type, str):
                if kernel_type == "linear":
                    self.metric_ = linear_kernel
                    self.calculate_norm_ = False
                elif kernel_type == "gaussian":
                    self.metric_ = lambda x, y: gaussian_kernel(x, y, 1)
                    self.calculate_norm_ = True
                elif kernel_type == "bridge":
                    self.metric_ = bridge_kernel
                    self.calculate_norm_ = False
                else:
                    raise ValueError('Unsupported kernel with name "' +
                                     str(self.kernel_type) + '"')
            elif (isinstance(kernel_type, tuple) and len(kernel_type) == 2 and
                    kernel_type[0] == "gaussian" and isinstance(kernel_type[1], Real)):
                # Bind mu now so the metric does not depend on later mutation
                # of `self.kernel_type`.
                mu = kernel_type[1]
                self.metric_ = lambda x, y: gaussian_kernel(x, y, mu)
                self.calculate_norm_ = True
            elif callable(kernel_type):
                # The user-supplied callable is the metric itself.
                self.metric_ = kernel_type
                self.calculate_norm_ = False
            else:
                raise TypeError('Unrecognized "kernel_type": can either be a str '
                                'from the supported: "linear", "gaussian", "bridge" '
                                'or tuple ("gaussian", mu) or a callable.')

    def parse_input(self, X):
        """Parse and check the given input for the Graph Hopper kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format).

        Returns
        -------
        out : list
            One tuple per parsed graph: ``(M, attributes)`` or, when
            `calculate_norm_` is set, ``(M, attributes, squared_norms)``.
            ``M`` has shape ``(n_nodes, max_diam, max_diam)``.

        Raises
        ------
        TypeError
            If `X` or one of its elements has an unsupported form, or if the
            node attributes of a graph cannot be stacked into one array.
        ValueError
            If no graph remains after ignoring empty elements.

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')

        ni = 0
        diam = list()
        graphs = list()
        for (i, x) in enumerate(iter(X)):
            is_iter = False
            if isinstance(x, Iterable):
                is_iter = True
                x = list(x)

            if type(x) is Graph:
                g = Graph(x.get_adjacency_matrix(),
                          x.get_labels(purpose="adjacency"),
                          {},
                          self._graph_format)
            elif is_iter and (len(x) == 0 or len(x) >= 2):
                if len(x) == 0:
                    warn('Ignoring empty element on index: '+str(i))
                    continue
                g = Graph(x[0], x[1], {}, "adjacency")
                g.change_format(self._graph_format)
            else:
                raise TypeError('each element of X must be either a '
                                'graph object or a list with at least '
                                'a graph like object and node attributes')

            spm, attr = g.build_shortest_path_matrix(labels="vertex")
            nv = g.nv()
            try:
                attributes = np.array([attr[j] for j in range(nv)])
            except (TypeError, ValueError):
                # Recent NumPy versions signal ragged input with ValueError.
                raise TypeError('All attributes of a single graph should have the same dimension.')
            diam.append(int(np.max(spm[spm < float("Inf")])))
            graphs.append((g.get_adjacency_matrix(), nv, attributes))
            ni += 1

        if ni == 0:
            raise ValueError('parsed input is empty')

        if self._method_calling == 1:
            max_diam = self._max_diam = max(diam) + 1
        else:
            max_diam = max(self._max_diam, max(diam) + 1)

        out = list()
        for AM, node_nr, attributes in graphs:
            des = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)
            occ = np.zeros(shape=(node_nr, node_nr, max_diam), dtype=int)

            # Convert adjacency matrix to dictionary (self loops are dropped)
            idx_i, idx_j = np.where(AM > 0)
            ed = defaultdict(dict)
            for (a, b) in filterfalse(lambda a: a[0] == a[1], zip(idx_i, idx_j)):
                ed[a][b] = AM[a, b]

            for j in range(node_nr):
                # Single-source shortest path from node j
                D, p = dijkstra(ed, j)
                D = np.array(list(D.get(k, float("Inf")) for k in range(node_nr)))
                p[j] = -1

                # Restrict to the connected component of node j
                conn_comp = np.where(D < float("Inf"))[0]
                # Number of nodes in connected component of node j
                conncomp_node_nr = conn_comp.shape[0]

                # To-be DAG adjacency matrix of connected component of node j
                A_cc = np.zeros(shape=(conncomp_node_nr, conncomp_node_nr))

                # Adjacency matrix of connected component of node j
                AM_cc = AM[conn_comp, :][:, conn_comp]
                D_cc = D[conn_comp]

                # Maps (original index + 1) -> index inside the component;
                # slot 0 is reserved for the root's parent marker (-1).
                conn_comp_converter = np.zeros(shape=(node_nr + 1,), dtype=int)
                conn_comp_converter[conn_comp + 1] = np.arange(conncomp_node_nr)
                p_cc = conn_comp_converter[np.array(list(p[k] for k in conn_comp)) + 1]

                for v in range(conncomp_node_nr):
                    # A parent mapped to component index 0 is indistinguishable
                    # from "no parent" here; such an edge is only recovered by
                    # the neighbour-distance step below (unit edge lengths).
                    if p_cc[v] > 0:
                        # Generate A_cc by adding directed edges of form (parent(v), v)
                        A_cc[p_cc[v], v] = 1

                    # Distance from v to j
                    v_dist = D_cc[v]

                    # All neighbors of v in the undirected graph
                    v_nbs = np.where(AM_cc[v, :] > 0)[0]

                    # Distances of neighbors of v to j
                    v_nbs_dists = D_cc[v_nbs]

                    # All neighbors of v in undirected graph who are
                    # one step closer to j than v is; i.e. SP-DAG parents
                    v_parents = v_nbs[v_nbs_dists == (v_dist - 1)]

                    # Add SP-DAG parents to A_cc
                    A_cc[v_parents, v] = 1

                # Computes the descendants & occurence vectors o_j(v), d_j(v)
                # for all v in the connected component
                occ_p, des_p = od_vectors_dag(A_cc, D_cc)

                # Convert back to the indices of the original graph. A single
                # slice assignment also covers the one-node component, without
                # assigning a (1, 1) array to a scalar slot.
                des[j, conn_comp, :des_p.shape[1]] = des_p
                occ[j, conn_comp, :occ_p.shape[1]] = occ_p

            M = np.zeros(shape=(node_nr, max_diam, max_diam))
            # j loops through choices of root
            for j in range(node_nr):
                # No squeeze: the slices are already 2-D, and squeezing would
                # drop an axis when node_nr == 1 or max_diam == 1.
                des_mat_j_root = des[j, :, :]
                occ_mat_j_root = occ[j, :, :]
                # M[v,:,:] is M[v]; a = node coordinate in path, b = path length
                # For every node v and every b >= a:
                #   M[v, a, b] += des[v, b - a] * occ[v, a]
                for a in range(max_diam):
                    M[:, a, a:] += (des_mat_j_root[:, :max_diam - a] *
                                    occ_mat_j_root[:, a:a + 1])

            if self.calculate_norm_:
                out.append((M, attributes, np.sum(attributes ** 2, axis=1)))
            else:
                out.append((M, attributes))
        return out

    def pairwise_operation(self, x, y):
        """Graph Hopper kernel as proposed in :cite:`feragen2013scalable`.

        Parameters
        ----------
        x, y : tuple
            Extracted features from `parse_input`.

        Returns
        -------
        kernel : number
            The kernel value.

        """
        xp, yp = x[0], y[0]
        # The two M tensors may have been built with different maximal
        # diameters: crop both to the common (smaller) one before flattening.
        m = min(xp.shape[1], yp.shape[1])
        m_sq = m**2
        if xp.shape[1] > m:
            xp = xp[:, :m, :m]
        elif yp.shape[1] > m:
            yp = yp[:, :m, :m]

        return self.metric_((xp.reshape(xp.shape[0], m_sq),) + x[1:],
                            (yp.reshape(yp.shape[0], m_sq),) + y[1:])
def linear_kernel(x, y):
    """Graph Hopper linear pairwise kernel as proposed in :cite:`feragen2013scalable`.

    Parameters
    ----------
    x, y : tuple
        Extracted features from `parse_input`: ``(M, attributes)`` with ``M``
        already flattened to two dimensions.

    Returns
    -------
    kernel : number
        The kernel value.

    """
    M_i, NA_i = x
    M_j, NA_j = y
    weight_matrix = np.dot(M_i, M_j.T)
    NA_linear_kernel = np.dot(NA_i, NA_j.T)
    # Sum over all node pairs of (path weight) * (node kernel).
    return np.dot(weight_matrix.ravel(), NA_linear_kernel.ravel())


def gaussian_kernel(x, y, mu):
    """Graph Hopper gaussian pairwise kernel as proposed in :cite:`feragen2013scalable`.

    Parameters
    ----------
    x, y : tuple
        Extracted features from `parse_input`:
        ``(M, attributes, squared_norms)`` with ``M`` already flattened to two
        dimensions.

    mu : Number
        The scale multiplying the squared distance inside the exponential,
        i.e. the node kernel is ``exp(-mu * ||a - b||^2)``.

    Returns
    -------
    kernel : number
        The kernel value.

    """
    M_i, NA_i, norm2_i = x
    M_j, NA_j, norm2_j = y
    weight_matrix = np.dot(M_i, M_j.T)
    NA_linear_kernel = np.dot(NA_i, NA_j.T)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 <a, b>; the transposes only serve
    # to broadcast norm2_i along rows and norm2_j along columns.
    NA_squared_distmatrix = ((-2*NA_linear_kernel.T + norm2_i).T + norm2_j)
    nodepair = np.exp(-mu*NA_squared_distmatrix)
    return np.dot(weight_matrix.ravel(), nodepair.ravel())


def bridge_kernel(x, y):
    """Graph Hopper bridge kernel as proposed in :cite:`feragen2013scalable`.

    Parameters
    ----------
    x, y : tuple
        Extracted features from `parse_input`: ``(M, attributes)`` with ``M``
        already flattened to two dimensions.

    Returns
    -------
    kernel : number
        The kernel value.

    """
    M_i, NA_i = x
    M_j, NA_j = y
    weight_matrix = np.dot(M_i, M_j.T)
    NAs = np.vstack([NA_i, NA_j])
    NAs_linear_kernel = np.dot(NAs, NAs.T)
    NAs_distances = kernelmatrix2distmatrix(NAs_linear_kernel)
    # Keep only the distances between nodes of x (rows) and nodes of y (cols).
    NA_i_NA_j_distances = NAs_distances[:NA_i.shape[0], NA_i.shape[0]:]
    nodepair = (4-NA_i_NA_j_distances)/4
    nodepair[nodepair < 0] = 0
    return np.dot(weight_matrix.ravel(), nodepair.ravel())


def kernelmatrix2distmatrix(K):
    """Convert a Kernel Matrix to a Distance Matrix.

    Parameters
    ----------
    K : np.array, n_dim=2
        The kernel matrix.

    Returns
    -------
    D : np.array, n_dim=2
        The distance matrix.

    """
    diag_K = K.diagonal().reshape(K.shape[0], 1)
    squared = diag_K + diag_K.T - 2*K
    # Round-off can leave tiny negative values for (near-)identical points;
    # clip them so the square root does not produce NaN.
    return np.sqrt(np.maximum(squared, 0))


def od_vectors_dag(G, shortestpath_dists):
    """Compute the set of occurrence and distance vectors for G.

    Defined in :cite:`feragen2013scalable`.

    Parameters
    ----------
    G : np.array, n_dim=2
        DAG induced from a gappy tree where the indexing of nodes gives a
        breadth first order of the corresponding original graph

    shortestpath_dists : np.array, n_dim=1
        Shortest path distances from the source node.

    Returns
    -------
    occ : np.array, n_dim=2
        n x d occurrence matrix occ, where n: `G.shape[0]` loops through the
        nodes of G, and d: 'diameter of G'. The rows of the occ matrix will be
        padded with zeros on the right.

    des : np.array, n_dim=2
        n x d descendant matrix des, where n: `G.shape[0]` loops through the
        nodes of G, and d: 'diameter of G'. The rows of the des matrix will be
        padded with zeros on the right.

    """
    dag_size = G.shape[0]
    DAG_gen_vector = shortestpath_dists + 1

    # This only works when the DAG is a shortest path DAG on an unweighted graph
    gen_sorted = DAG_gen_vector.argsort()
    re_sorted = gen_sorted.argsort()
    sortedG = G[gen_sorted, :][:, gen_sorted]
    delta = int(np.max(DAG_gen_vector))

    # Initialize:
    # only the first node in generation order (the source) starts with an
    # occurrence, at position 0.
    occ = np.zeros(shape=(dag_size, delta), dtype=int)
    occ[0, 0] = 1

    # Initialize:
    # every node starts with a 1 at position 0 of its descendant vector.
    des = np.zeros(shape=(dag_size, delta), dtype=int)
    des[:, 0] = 1

    for i in range(dag_size):
        # Rows/columns of sortedG are already 1-D, so no squeeze is applied:
        # squeezing a length-1 row would give a 0-d array, which np.where
        # does not accept in current NumPy.
        edges_starting_at_ith = np.where(sortedG[i, :] == 1)[0]
        # Push the occurrences of node i, shifted one step, to its children.
        occ[edges_starting_at_ith, :] += np.hstack([0, occ[i, :-1]])

        # Now use message-passing from the bottom of the DAG to add up the
        # edges from each node. Nodes are sorted by generation, so we can
        # just start from the end of the DAG matrix.
        last = dag_size - i - 1
        edges_ending_at_ith_from_end = np.where(sortedG[:, last] == 1)[0]
        des[edges_ending_at_ith_from_end, :] += np.hstack([0, des[last, :-1]])

    return occ[re_sorted, :], des[re_sorted, :]


if __name__ == '__main__':
    from grakel.datasets import fetch_dataset
    import argparse

    # Create an argument parser for the Graph Hopper benchmark script
    parser = argparse.ArgumentParser(
        description='Measuring classification accuracy '
                    'on graph_hopper')

    parser.add_argument(
        '--dataset',
        help='choose the dataset you want the tests to be executed',
        type=str,
        default="BZR"
    )

    parser.add_argument(
        '--full',
        help='fit_transform the full graph',
        action="store_true")

    mec = parser.add_mutually_exclusive_group()
    mec.add_argument(
        '--linear',
        help='choose a linear kernel',
        action="store_true")
    mec.add_argument(
        '--gaussian',
        help='choose a gaussian kernel (optionaly add a mu: default=1)',
        nargs='?',
        type=str,
        const='1',
        default=None)
    mec.add_argument(
        '--bridge',
        help='choose a bridge kernel',
        action="store_true")

    # Get the dataset name and the kernel type
    args = parser.parse_args()
    dataset_name = args.dataset
    if args.gaussian is not None:
        kernel_type = ('gaussian', float(args.gaussian))
    elif bool(args.bridge):
        kernel_type = 'bridge'
    else:
        kernel_type = 'linear'

    full = bool(args.full)

    # The baseline dataset for node/edge-attributes
    dataset_attr = fetch_dataset(dataset_name,
                                 with_classes=True,
                                 prefer_attr_nodes=True,
                                 verbose=True)

    from tqdm import tqdm
    from time import time

    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import KFold
    from sklearn import svm

    def sec_to_time(sec):
        """Print time in a correct format."""
        dt = list()
        days = int(sec // 86400)
        if days > 0:
            sec -= 86400*days
            dt.append(str(days) + " d")

        hrs = int(sec // 3600)
        if hrs > 0:
            sec -= 3600*hrs
            dt.append(str(hrs) + " h")

        mins = int(sec // 60)
        if mins > 0:
            sec -= 60*mins
            dt.append(str(mins) + " m")

        if sec > 0:
            dt.append(str(round(sec, 2)) + " s")
        return " ".join(dt)

    # Graphs and class labels of the dataset selected with --dataset.
    G, y = list(dataset_attr.data), dataset_attr.target
    C_grid = (10. ** np.arange(-7, 7, 2) / len(G)).tolist()

    stats = {"acc": list(), "time": list()}

    kf = KFold(n_splits=10, random_state=42, shuffle=True)
    niter = kf.get_n_splits(y)

    for (k, (train_index, test_index)) in tqdm(enumerate(kf.split(G, y)),
                                               total=niter):
        # Train-test split of graph data
        G_train = [G[i] for i in train_index]
        y_train = [y[i] for i in train_index]
        G_test = [G[i] for i in test_index]
        y_test = [y[i] for i in test_index]

        start = time()
        gk = GraphHopper(normalize=True, kernel_type=kernel_type)

        # Calculate the kernel matrix.
        if full:
            K = gk.fit_transform(G)
            K_train = K[train_index, :][:, train_index]
            K_test = K[test_index, :][:, train_index]
        else:
            K_train = gk.fit_transform(G_train)
            K_test = gk.transform(G_test)
        end = time()

        # Cross validation on C, variable
        acc = 0
        for c in C_grid:
            # Initialise an SVM and fit.
            clf = svm.SVC(kernel='precomputed', C=c)

            # Fit on the train Kernel
            clf.fit(K_train, y_train)

            # Predict and test.
            y_pred = clf.predict(K_test)

            # Calculate accuracy of classification.
            acc = max(acc, accuracy_score(y_test, y_pred))

        stats["acc"].append(acc)
        stats["time"].append(end-start)

    print("Mean values of", niter, "iterations:")
    print("GraphHopper", "> Accuracy:",
          str(round(np.mean(stats["acc"])*100, 2)),
          "% | Took:", sec_to_time(np.mean(stats["time"])))