Source code for grakel.kernels.odd_sth

"""The ODD-Sth kernel as defined in :cite:`Martino2012ATK`."""
# Author: Ioannis Siglidis <y.siglidis@gmail.com>
# License: BSD 3 clause
import copy
import warnings

import numpy as np

# `Iterable` moved to `collections.abc` in Python 3.3 and was removed from
# `collections` in Python 3.10; keep the old location as a fallback.
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable
from collections import defaultdict

from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

from grakel.kernels import Kernel
from grakel.graph import Graph

# Python 2/3 cross-compatibility import
from six import iteritems


class OddSth(Kernel):
    """ODD-Sth kernel as proposed in :cite:`Martino2012ATK`.

    Each graph is decomposed into per-vertex DAGs which are merged into a
    single "bigDAG" (see ``make_big_dag``/``big_dag_append``); the kernel is
    computed from per-vertex frequency vectors of that bigDAG.

    Parameters
    ----------
    h : int, default=None
        Maximum (single) dag height.
        If None there is no restriction.

    Attributes
    ----------
    _nx : int
        The number of parsed inputs on fit.

    _ny : int
        The number of parsed inputs on transform.

    _phi_x : np.array, n_dim=2
        A numpy array corresponding to all the frequency values for each
        vertex, corresponding to the fitted data, in the resulting bigDAG
        of the fitted and transformed data.

    _phi_y : np.array, n_dim=2
        A numpy array corresponding to all the frequency values for each
        vertex, corresponding to the transformed data, in the resulting
        bigDAG of the fitted and transformed data.

    _C : np.array, n_dim=1
        A numpy array corresponding to the vertex depth of each node, in
        the resulting bigDAG of the fitted and transformed data.

    """
[docs] def __init__(self, n_jobs=None, normalize=False, verbose=False, h=None): """Initialise an `odd_sth` kernel.""" super(OddSth, self).__init__(n_jobs=n_jobs, normalize=normalize, verbose=verbose) self.h = h self._initialized.update({"h": False})
[docs] def initialize(self): """Initialize all transformer arguments, needing initialization.""" if not self._initialized["n_jobs"]: if self.n_jobs is not None: warnings.warn('no implemented parallelization for OddSth') self._initialized["n_jobs"] = True if not self._initialized["h"]: if self.h is not None and (type(self.h) is not int or self.h <= 0): raise ValueError('h must be an integer bigger than zero') self.h_ = (-1 if self.h is None else self.h) self._initialized["h"] = True
    def parse_input(self, X):
        """Parse input and compute the bigDAG features for the ODD-Sth kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and
            at least one. The first that is obligatory is a valid graph
            structure (adjacency matrix or edge_dictionary) while the second
            is node_labels and the third edge_labels (that correspond to the
            given graph format). A valid input also consists of graph type
            objects.

        Returns
        -------
        out : tuple
            A tuple corresponding to the calculated bigDAG.

        Raises
        ------
        TypeError
            If X is not iterable, or an element is neither a Graph nor an
            iterable of at least two items.

        ValueError
            If no non-empty element could be parsed.

        """
        # NOTE(review): `Iterable` comes from a module-level
        # `from collections import Iterable`, which breaks on Python 3.10+
        # (moved to `collections.abc` in 3.3) -- confirm the import block.
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            out = None
            # _method_calling == 3 means transform(): start from a deep copy
            # of the fitted bigDAG and append the new graphs onto it.
            if self._method_calling == 3:
                out = copy.deepcopy(self.X)
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: '+str(idx))
                        continue
                    elif len(x) >= 2:
                        # Only the structure (x[0]) and node labels (x[1])
                        # are forwarded; a third element, if present, is not
                        # used by this decomposition.
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    # Reached for non-iterable non-Graph input and for
                    # iterables of length 1.
                    raise TypeError('each element of X must have either ' +
                                    'a graph with labels for node and edge ' +
                                    'or 3 elements consisting of a graph ' +
                                    'type object, labels for vertices and ' +
                                    'labels for edges.')
                # merge_features=False keeps one frequency column per input
                # graph; fit_transform/transform read these as phi columns.
                out = big_dag_append(make_big_dag(x, self.h_), out,
                                     merge_features=False)
                i += 1
            # Record the number of parsed graphs: on fit (1) as _nx, on
            # transform (3) as _ny.
            if self._method_calling == 1:
                self._nx = i
            elif self._method_calling == 3:
                self._ny = i
            if i == 0:
                raise ValueError('parsed input is empty')
            return out
[docs] def fit_transform(self, X, y=None): """Fit and transform, on the same dataset. Parameters ---------- X : iterable Each element must be an iterable with at most three features and at least one. The first that is obligatory is a valid graph structure (adjacency matrix or edge_dictionary) while the second is node_labels and the third edge_labels (that fitting the given graph format). If None the kernel matrix is calculated upon fit data. The test samples. y : Object, default=None Ignored argument, added for the pipeline. Returns ------- K : numpy array, shape = [_nx, _nx] corresponding to the kernel matrix, a calculation between all pairs of graphs between target an features """ self._method_calling = 2 self.fit(X) # calculate feature matrices. ref = dict(enumerate(self.X[0].keys())) C = np.empty(shape=(len(ref), 1)) phi = np.empty(shape=(len(ref), self._nx)) for (i, v) in iteritems(ref): # number of identical subtrees # equal the D element C[i] = self.X[0][v][0] for j in range(self._nx): phi[i, j] = self.X[0][v][1][j] km = np.dot(phi.T, np.multiply(phi, C)) self._X_diag = np.diagonal(km) if self.normalize: return np.divide(km, np.sqrt(np.outer(self._X_diag, self._X_diag))) else: return km return km
    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and
            at least one. The first that is obligatory is a valid graph
            structure (adjacency matrix or edge_dictionary) while the second
            is node_labels and the third edge_labels (that fitting the given
            graph format). If None the kernel matrix is calculated upon fit
            data. The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            # parse_input appends the new graphs onto a copy of the fitted
            # bigDAG: frequency columns [0, _nx) belong to the fit graphs,
            # columns [_nx, _nx + _ny) to the transform graphs.
            full_dag = self.parse_input(X)

        # Enumerate the vertices of the merged bigDAG.
        ref = dict(enumerate(full_dag[0].keys()))
        C = np.empty(shape=(len(ref), 1))
        phi_x = np.empty(shape=(len(ref), self._nx))
        phi_y = np.empty(shape=(len(ref), self._ny))
        for (i, v) in enumerate(ref.keys()):
            # number of identical subtrees equal the D element
            C[i] = full_dag[0][v][0]
            for j in range(self._nx):
                phi_x[i, j] = full_dag[0][v][1][j]
            for j in range(self._ny):
                phi_y[i, j] = full_dag[0][v][1][j + self._nx]
        # Cache the feature matrices so diagonal() can reuse them.
        self._phi_x, self._phi_y, self._C = phi_x, phi_y, C
        km = np.dot(phi_y.T, np.multiply(phi_x, C))
        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            # In-place normalization of the freshly computed matrix.
            km /= np.sqrt(np.outer(Y_diag, X_diag))
        return km
[docs] def diagonal(self): """Calculate the kernel matrix diagonal of the fitted data. Parameters ---------- None. Returns ------- X_diag : np.array The diagonal of the kernel matrix, of the fitted. This consists of each element calculated with itself. """ # Check is fit had been called check_is_fitted(self, ['_phi_x', '_C']) try: check_is_fitted(self, ['_X_diag']) except NotFittedError: # Calculate diagonal of X self._X_diag = np.dot(np.square(self._phi_X), self._C).reshape((self._nx, 1)) try: check_is_fitted(self, ['_phi_y']) Y_diag = np.dot(np.square(self._phi_y).T, self._C).reshape((self._ny, 1)) return self._X_diag, Y_diag except NotFittedError: return self._X_diag
def make_big_dag(g, h):
    """Compose a big dag out of all dags of a graph.

    Parameters
    ----------
    g : graph
        A graph type object.

    h : int, default=None
        Maximum (single) dag height.
        If None there is no restriction.

    Returns
    -------
    big_dag : tuple
        The big dag tuple consisting of:
            + A dictionary for vertices containing at each value the
              frequency, the depth and the ID
            + A hash map from each ID to a list of vertices.
            + A list of sorted vertices based on ordering
            + A dictionary of edges.
            + A dictionary of labels

    """
    big_dag = None
    for v in g.get_vertices(purpose='any'):
        dag_odd = make_dag_odd(v, g, h)
        # Assemble (vertices, hash_map, ordered_vertices, edges, labels);
        # the no-op `+ tuple([])` of the original was removed.
        dag = tuple(hash_trees(dag_odd)) + (dag_odd[1], dag_odd[3])
        big_dag = big_dag_append(dag, big_dag)

    # Re-sort the merged DAG with an inverse topological ordering.
    _, D_edges, D_ordering, _ = odd(big_dag[0], big_dag[2], big_dag[3])

    big_dag = (big_dag[0],
               big_dag[1],
               sorted(list(big_dag[0].keys()),
                      key=lambda x: (D_ordering[x], big_dag[3][x])),
               D_edges,
               big_dag[3])

    return big_dag


def make_dag_odd(v, g, h):
    """Calculate the vertex rooted dag and apply inverse topological sorting.

    Parameters
    ----------
    v : hashable
        The dag vertex root.

    g : graph
        A graph type object from where v is coming from.

    h : int, default=None
        Maximum depth of the exploration.

    Returns
    -------
    odd_dag : tuple
        A tuple representing the topologically sorted dag:
            + A set of vertices
            + A dictionary of sorted edges for each node based on ordering
              and labels
            + A dictionary of ordering for each node
            + A dictionary of labels for each node

    """
    vertices, edges = dag(v, g, h)
    return odd(vertices, edges, g.get_labels(purpose='any'))


def dag(v, g, h):
    """BFS exploration that returns a dag.

    Parameters
    ----------
    v : hashable
        The dag vertex root.

    g : graph
        A graph type object from where v is coming from.

    h : int, default=None
        Maximum depth of the exploration (-1 means unbounded).

    Returns
    -------
    vertices : set
        The dag vertices.

    edges : dict
        The dictionary of edges. For each node a list of vertices.

    """
    q = [(v, 0)]
    vertices = dict()
    edges = defaultdict(list)
    vertices[v] = 0
    while len(q) > 0:
        u, level = q.pop(0)
        # BFS visits by increasing level, so everything still queued is at
        # depth >= h and can be dropped wholesale.
        if level == h:
            break
        for n in g.neighbors(u, purpose='any'):
            if n not in vertices:
                edges[u].append(n)
                q.append((n, level+1))
                vertices[n] = level+1
            elif vertices[n] >= level+1:
                # Keep DAG edges towards already discovered vertices that
                # are not closer to the root (no back edges).
                edges[u].append(n)

    vertices = set(vertices.keys())
    return vertices, edges


def odd(vertices, edges, labels):
    """Calculate the inverse topological order of a DAG and sort its edges.

    Parameters
    ----------
    vertices : dict or set
        A set of vertices.

    edges : dict
        Edges between vertices.

    labels : dict
        Labels for each vertex.

    Returns
    -------
    vertices : dict or set
        A dictionary or a set of vertices depending on the given input.

    edges : dict
        An edges dictionary, where for each edge the list of adjacent
        vertices is sorted corresponding to the given ordering or labels.

    ordering : dict
        The inverse topological ordering for each vertex.

    labels : dict
        The labels dictionary for each vertex.

    """
    # Kahn's algorithm for topological sorting of dags

    # Step-1: Compute in-degree (number of incoming edges)
    # for each of the vertex present in the DAG and initialize
    # the count of visited nodes as 0.
    indegrees = dict()
    if type(vertices) is set:
        zero_indegrees = vertices.copy()
        visited_nodes = len(vertices)
    elif type(vertices) is dict:
        zero_indegrees = set(vertices.keys())
        visited_nodes = len(vertices.keys())
    else:
        raise TypeError('unsupported vertices type')

    # Modernized: plain dict.items() replaces six.iteritems (identical
    # iteration on both Python versions).
    for (k, e) in edges.items():
        for v in e:
            if v not in indegrees:
                indegrees[v] = 1
            else:
                indegrees[v] += 1
            zero_indegrees.discard(v)

    # Step-2: Pick all the vertices with in-degree as
    # 0 and add them into a queue
    q = list(zero_indegrees)
    ordering = dict()
    while len(q) > 0:
        # Step-3: Remove a vertex from the queue and then
        # Increment count of visited nodes by 1.
        # Decrease in-degree by 1 for all its neighboring nodes.
        # If in-degree of a neighboring nodes is reduced to zero,
        # then add it to the queue.
        # Ties between available vertices are broken by label so the
        # ordering is deterministic.
        q.sort(key=lambda x: labels[x])
        e = q.pop(0)
        ordering[e] = visited_nodes
        for k in edges[e]:
            if k in indegrees:
                if indegrees[k] == 1:
                    indegrees.pop(k)
                    q.append(k)
                else:
                    indegrees[k] -= 1
        # Counting down yields the *inverse* topological order
        # (leaves get the smallest numbers).
        visited_nodes -= 1

    # apply the ordering (sorts the adjacency lists in place)
    for k in edges.keys():
        edges[k].sort(key=lambda x: (ordering[x], labels[x]))

    return vertices, edges, ordering, labels


def hash_trees(tree):
    """Hash trees and add frequencies and a hash map.

    Parameters
    ----------
    tree : tuple
        A tuple of elements corresponding to a tree:
            + A set of vertices
            + A dictionary of edges
            + A dictionary that corresponds to an ordering of vertices
            + A dictionary of labels

    Returns
    -------
    hash_tree : tuple
        A hashed version of the tree:
            + A dictionary from vertices to tuples containing the subtree
              size, frequencies and node ID
            + A dictionary between hashes and vertices (representing a
              valid hash map)
            + A list of ordered vertices, based on the inverse topological
              ordering

    """
    (vertex, edge, ordering, labels) = tree
    # Children come before parents, so every child ID is available when a
    # parent ID is composed below.
    v_ordered = sorted(list(vertex), key=lambda x: (ordering[x], labels[x]))
    vertices_hash_map = dict()
    vertices = dict()
    for v in v_ordered:
        if v not in edge or len(edge[v]) == 0:
            # Leaf: its ID is simply its label.
            if labels[v] not in vertices_hash_map:
                vertices_hash_map[labels[v]] = list()
            vertices_hash_map[labels[v]].append(v)
            vertices[v] = [0, 1, str(labels[v])]
        else:
            # Internal node: ID is label(children-IDs), children in the
            # canonical (ordering, label) edge order.
            neighbors_ids = []
            d = 0
            for n in edge[v]:
                d += 1 + vertices[n][0]
                neighbors_ids.append(vertices[n][2])
            ID = str(labels[v])+'('+(','.join(neighbors_ids))+')'
            vertices[v] = [d, 1, ID]
            if ID not in vertices_hash_map:
                vertices_hash_map[ID] = list()
            vertices_hash_map[ID].append(v)
    return (vertices, vertices_hash_map, v_ordered)


def big_dag_append(dag, big_dag=None, merge_features=True):
    """Calculate the *minimal DAG* or *BigDAG*.

    See :cite:`Martino2006` and notated in :cite:`Martino2012ATK`.

    Parameters
    ----------
    dag : tuple
        A tuple representing a single dag containing:
            + A dictionary from vertices to tuples containing the subtree
              size, frequencies and node ID
            + A dictionary between hashes and vertices (representing a
              valid hash map)
            + A list of ordered vertices, based on inverse topological
              ordering
            + A dictionary corresponding to edges from vertex to a list of
              adjacent vertices
            + A dictionary of labels

    big_dag : tuple, default=None
        The dag on which the dag will be appended.
        If None: builds it from dag, else the tuple is the same as the
        format of this function output.

    merge_features : bool, default=True
        If True increments frequencies when a same element is found,
        else keeps them as vectors (one slot per appended dag).

    Returns
    -------
    big_dag : tuple
        A tuple representing Big_Dag:
            + A dictionary from vertices to tuples containing the subtree
              size, frequencies and node ID
            + A dictionary between hashes and vertices (representing a
              valid hash map)
            + A dictionary corresponding to edges from vertex to a list of
              adjacent vertices.
            + A dictionary of labels.

    """
    if big_dag is None:
        D_labels = dict()
        D_hash_map = dict()
        D_vertices = dict()
        D_edges = dict()
        nodes_idx = 0
        nf = 1
    else:
        (D_vertices, D_hash_map, D_edges, D_labels) = big_dag
        nodes_idx = len(D_vertices.keys())
        if not merge_features:
            # Open a fresh zero slot for this dag on every existing vertex
            # and record the new frequency-vector length nf.
            f = True
            for v in D_vertices.keys():
                D_vertices[v][1].append(0)
                if f:
                    nf = len(D_vertices[v][1])
                    f = False
            if f:
                nf = 1
        # NOTE(review): with merge_features=True and an existing big_dag,
        # nf is never assigned -- harmless, as nf is only read on the
        # merge_features=False path.

    (vertices, vertices_hash_map, v_ordered, edges, labels) = dag
    for q in v_ordered:
        key = vertices[q][2]
        if key in D_hash_map:
            node = D_hash_map[key][0]
            # update frequency
            if merge_features:
                D_vertices[node][1] += vertices[q][1]
            else:
                D_vertices[node][1][-1] += vertices[q][1]
        else:
            D_labels[nodes_idx] = labels[q]
            d_edges = list()
            # in order to avoid collisions
            v_nodes = set()
            for c in edges[q]:
                ck = vertices[c][2]
                if ck in D_hash_map:
                    node = D_hash_map[ck][0]
                    # add edges with their new indexes
                    if node not in v_nodes:
                        d_edges.append(node)
                        v_nodes.add(node)
            D_edges[nodes_idx] = d_edges
            D_hash_map[key] = [nodes_idx]
            if merge_features:
                freq = vertices[q][1]
            else:
                freq = (nf-1)*[0] + [vertices[q][1]]
            D_vertices[nodes_idx] = [vertices[q][1], freq, key]
            nodes_idx += 1

    return (D_vertices, D_hash_map, D_edges, D_labels)


if __name__ == "__main__":
    # Dag Example from original paper
    tree_a = ({0, 1, 2, 3},
              {0: [1, 2], 1: [3], 2: [], 3: []},
              {0: 'a', 1: 'b', 2: 'd', 3: 'c'})

    tree_b = ({0, 1, 2, 3, 4},
              {0: [1, 2], 1: [3], 2: [4], 3: [], 4: []},
              {0: 'a', 1: 'b', 2: 'c', 3: 'c', 4: 'd'})

    todd_a = odd(tree_a[0], tree_a[1], tree_a[2])
    todd_b = odd(tree_b[0], tree_b[1], tree_b[2])

    Atree = tuple(hash_trees(todd_a)) + (todd_a[1], todd_a[3])
    Btree = tuple(hash_trees(todd_b)) + (todd_b[1], todd_b[3])

    big_dag = None
    big_dag = big_dag_append(Atree, big_dag)
    big_dag = big_dag_append(Btree, big_dag)

    print("Tree A:\n", Atree,
          "\nTree B:\n", Btree,
          "\nBig Dag:\n", big_dag)