Source code for grakel.kernels.odd_sth

"""The ODD-Sth kernel as defined in :cite:`Martino2012ATK`."""
# Author: Ioannis Siglidis <y.siglidis@gmail.com>
# License: BSD 3 clause
import copy
import warnings

import numpy as np

from collections import Iterable
from collections import defaultdict

from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

from grakel.kernels import Kernel
from grakel.graph import Graph

# Python 2/3 cross-compatibility import
from six import iteritems


[docs]class OddSth(Kernel):
    """ODD-Sth kernel as proposed in :cite:`Martino2012ATK`.

    Parameters
    ----------
    h : int, default=None
        Maximum (single) dag height.
        If None there is no restriction.

    Attributes
    ----------
    _nx : int
        The number of parsed inputs on fit.

    _ny : int
        The number of parsed inputs on transform.

    _phi_x : np.array, n_dim=2
        A numpy array corresponding all the frequency values for
        each vertex, coresponding to the fitted data, in the
        resulting bigDAG of the fitted and transformed data.

    _phi_y : np.array, n_dim=2
        A numpy array corresponding all the frequency values for
        each vertex, corresponding to the transformed data, in the
        resulting bigDAG of the fitted and transformed data.

    _C : np.array, n_dim=1
        A numpy array corresponding to the vertex depth of each node, in
        the resulting bigDAG of the fitted and transformed data.

    """

[docs]    def __init__(self, n_jobs=None, normalize=False, verbose=False, h=None):
        """Initialise an `odd_sth` kernel."""
        super(OddSth, self).__init__(n_jobs=n_jobs,
                                     normalize=normalize,
                                     verbose=verbose)
        self.h = h
        self._initialized.update({"h": False})

[docs]    def initialize(self):
        """Initialize all transformer arguments, needing initialization."""
        if not self._initialized["n_jobs"]:
            if self.n_jobs is not None:
                warnings.warn('no implemented parallelization for OddSth')
            self._initialized["n_jobs"] = True

        if not self._initialized["h"]:
            if self.h is not None and (type(self.h) is not int or self.h <= 0):
                raise ValueError('h must be an integer bigger than zero')

            self.h_ = (-1 if self.h is None else self.h)
            self._initialized["h"] = True

[docs]    def parse_input(self, X):
        """Parse and create features for the propagation kernel.

        Parameters
        ----------
        X : iterable
            For the input to pass the test, we must have:
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that correspond to the given
            graph format). A valid input also consists of graph type objects.

        Returns
        -------
        out : tuple
            A tuple corresponding to the calculated bigDAG.

        """
        if not isinstance(X, Iterable):
            raise TypeError('input must be an iterable\n')
        else:
            i = 0
            out = None
            if self._method_calling == 3:
                out = copy.deepcopy(self.X)
            for (idx, x) in enumerate(iter(X)):
                is_iter = isinstance(x, Iterable)
                if is_iter:
                    x = list(x)
                if is_iter and (len(x) == 0 or len(x) >= 2):
                    if len(x) == 0:
                        warnings.warn('Ignoring empty element' +
                                      ' on index: '+str(idx))
                        continue
                    elif len(x) >= 2:
                        x = Graph(x[0], x[1], {}, self._graph_format)
                elif type(x) is not Graph:
                    raise TypeError('each element of X must have either ' +
                                    'a graph with labels for node and edge ' +
                                    'or 3 elements consisting of a graph ' +
                                    'type object, labels for vertices and ' +
                                    'labels for edges.')
                out = big_dag_append(make_big_dag(x, self.h_), out, merge_features=False)
                i += 1

            if self._method_calling == 1:
                self._nx = i
            elif self._method_calling == 3:
                self._ny = i
            if i == 0:
                raise ValueError
                ('parsed input is empty')
            return out

[docs]    def fit_transform(self, X, y=None):
        """Fit and transform, on the same dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). If None the kernel matrix is calculated upon fit data.
            The test samples.

        y : Object, default=None
            Ignored argument, added for the pipeline.

        Returns
        -------
        K : numpy array, shape = [_nx, _nx]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 2
        self.fit(X)

        # calculate feature matrices.
        ref = dict(enumerate(self.X[0].keys()))

        C = np.empty(shape=(len(ref), 1))
        phi = np.empty(shape=(len(ref), self._nx))
        for (i, v) in iteritems(ref):
            # number of identical subtrees
            # equal the D element
            C[i] = self.X[0][v][0]
            for j in range(self._nx):
                phi[i, j] = self.X[0][v][1][j]

        km = np.dot(phi.T, np.multiply(phi, C))

        self._X_diag = np.diagonal(km)
        if self.normalize:
            return np.divide(km, np.sqrt(np.outer(self._X_diag, self._X_diag)))
        else:
            return km
        return km

[docs]    def transform(self, X):
        """Calculate the kernel matrix, between given and fitted dataset.

        Parameters
        ----------
        X : iterable
            Each element must be an iterable with at most three features and at
            least one. The first that is obligatory is a valid graph structure
            (adjacency matrix or edge_dictionary) while the second is
            node_labels and the third edge_labels (that fitting the given graph
            format). If None the kernel matrix is calculated upon fit data.
            The test samples.

        Returns
        -------
        K : numpy array, shape = [n_targets, n_input_graphs]
            corresponding to the kernel matrix, a calculation between
            all pairs of graphs between target an features

        """
        self._method_calling = 3
        # Check is fit had been called
        check_is_fitted(self, ['X'])

        # Input validation and parsing
        if X is None:
            raise ValueError('transform input cannot be None')
        else:
            full_dag = self.parse_input(X)

        ref = dict(enumerate(full_dag[0].keys()))

        C = np.empty(shape=(len(ref), 1))
        phi_x = np.empty(shape=(len(ref), self._nx))
        phi_y = np.empty(shape=(len(ref), self._ny))
        for (i, v) in enumerate(ref.keys()):
            # number of identical subtrees equal the D element
            C[i] = full_dag[0][v][0]
            for j in range(self._nx):
                phi_x[i, j] = full_dag[0][v][1][j]
            for j in range(self._ny):
                phi_y[i, j] = full_dag[0][v][1][j + self._nx]

        self._phi_x, self._phi_y, self._C = phi_x, phi_y, C
        km = np.dot(phi_y.T, np.multiply(phi_x, C))
        self._is_transformed = True
        if self.normalize:
            X_diag, Y_diag = self.diagonal()
            km /= np.sqrt(np.outer(Y_diag, X_diag))
        return km

[docs]    def diagonal(self):
        """Calculate the kernel matrix diagonal of the fitted data.

        Parameters
        ----------
        None.

        Returns
        -------
        X_diag : np.array
            The diagonal of the kernel matrix, of the fitted. This consists
            of each element calculated with itself.


        """
        # Check is fit had been called
        check_is_fitted(self, ['_phi_x', '_C'])
        try:
            check_is_fitted(self, ['_X_diag'])
        except NotFittedError:
            # Calculate diagonal of X
            self._X_diag = np.dot(np.square(self._phi_X),
                                  self._C).reshape((self._nx, 1))

        try:
            check_is_fitted(self, ['_phi_y'])
            Y_diag = np.dot(np.square(self._phi_y).T,
                            self._C).reshape((self._ny, 1))
            return self._X_diag, Y_diag
        except NotFittedError:
            return self._X_diag


def make_big_dag(g, h):
    """Compose a big dag out of all dags of a graph.

    Parameters
    ----------
    g : graph
        A graph type object.

    h : int, default=None
        Maximum (single) dag height.
        If None there is no restriction.

    Returns
    -------
    big_dag : tuple
        The big dag tuple consisting of:
            + A dictionary for vertices containing at each value the frequency
              the depth and the ID
            + A hash map from each ID to a list of vertices.
            + A list of sorted vertices based on ordering
            + A dictionary of edges.
            + A dictionary of labels

    """
    big_dag = None
    for v in g.get_vertices(purpose='any'):
        dag_odd = make_dag_odd(v, g, h)
        dag = tuple(hash_trees(dag_odd))+tuple([])+(dag_odd[1], dag_odd[3])
        big_dag = big_dag_append(dag, big_dag)

    _, D_edges, D_ordering, _ = odd(big_dag[0], big_dag[2], big_dag[3])

    big_dag = (big_dag[0],
               big_dag[1],
               sorted(list(big_dag[0].keys()),
               key=lambda x:
                      (D_ordering[x], big_dag[3][x])), D_edges, big_dag[3])

    return big_dag


def make_dag_odd(v, g, h):
    """Calculate the vertex rooted dag and apply inverse topological sorting.

    Parameters
    ----------
    v : hashable
        The dag vertex root.

    g : graph
        A graph type object from where v is coming from.

    h : int, default=None
        Maximum depth of the exploration.

    Returns
    -------
    odd_dag : tuple
        A a tuple representing the topologically sorted dag:
            + A set of vertices
            + A dictionary of sorted edges for each node based on ordering
              and labels
            + A dictionary of ordering for each node
            + A dictionary of labels for each node

    """
    vertices, edges = dag(v, g, h)
    return odd(vertices, edges, g.get_labels(purpose='any'))


def dag(v, g, h):
    """BFS exploration that returns a dag.

    Parameters
    ----------
    v : hashable
        The dag vertex root.

    g : graph
        A graph type object from where v is coming from.

    h : int, default=None
        Maximum depth of the exploration.

    Returns
    -------
    vertices : set
        The dag vertices.

    edges : dict
        The dictionary of edges. For each node a list of vertices.

    """
    q = [(v, 0)]
    vertices = dict()
    edges = defaultdict(list)

    vertices[v] = 0
    while len(q) > 0:
        u, level = q.pop(0)

        if level == h:
            break

        for n in g.neighbors(u, purpose='any'):
            if n not in vertices:
                edges[u].append(n)
                q.append((n, level+1))
                vertices[n] = level+1
            elif vertices[n] >= level+1:
                edges[u].append(n)

    vertices = set(vertices.keys())
    return vertices, edges


def odd(vertices, edges, labels):
    """Calculate the inverse topological order of a DAG and sorts it's edges.

    Parameters
    ----------
    vertices : dict or set
        A set of vertices.

    edges : dict
        Edges between vertices.

    labels : dict
        Labels for each vertex.

    Returns
    -------
    vertices : dict or set
        A dictionary or a set of vertices depending on the given input.

    edges : dict
        An edges dictionary, where for each edge the list of adjacent vertex
        is sorted corresponding to the given ordering or labels.

    ordering : dict
        The inverse topological ordering for each vertex.

    labels : dict
        The labels dictionary for each vertex.

    """
    # Kahn's algorithm for topological sorting of dags

    # Step-1: Compute in-degree (number of incoming edges)
    # for each of the vertex present in the DAG and initialize
    # the count of visited nodes as 0.
    indegrees = dict()

    if type(vertices) is set:
        zero_indegrees = vertices.copy()
        visited_nodes = len(vertices)
    elif type(vertices) is dict:
        zero_indegrees = set(vertices.keys())
        visited_nodes = len(vertices.keys())
    else:
        raise TypeError('unsupported vertices type')

    for (k, e) in iteritems(edges):
        for v in e:
            if v not in indegrees:
                indegrees[v] = 1
            else:
                indegrees[v] += 1
            zero_indegrees.discard(v)

    # Step-2: Pick all the vertices with in-degree as
    # 0 and add them into a queue
    q = list(zero_indegrees)
    ordering = dict()
    while len(q) > 0:
        # Step-3: Remove a vertex from the queue and then
        # Increment count of visited nodes by 1.
        # Decrease in-degree by 1 for all its neighboring nodes.
        # If in-degree of a neighboring nodes is reduced to zero,
        # then add it to the queue.

        q.sort(key=lambda x: labels[x])
        e = q.pop(0)
        ordering[e] = visited_nodes
        for k in edges[e]:
            if k in indegrees:
                if indegrees[k] == 1:
                    indegrees.pop(k)
                    q.append(k)
                else:
                    indegrees[k] -= 1
        visited_nodes -= 1

    # apply the ordering
    for k in edges.keys():
        edges[k].sort(key=lambda x: (ordering[x], labels[x]))

    return vertices, edges, ordering, labels


def hash_trees(tree):
    """Hashes trees and adds frequencies and a hash map.

    Parameters
    ----------
    tree : tuple
        A tuple of elements corresponding to a tree:
            + A set of vertices
            + A dictionary of edges
            + A dictionary that corresponds to an ordering of vertices
            + A dictionary of labels

    Returns
    -------
    hash_tree : tuple
        A hashed version of the tree:
            + A dictionary from vertices to tuples containing the subtree size,
              frequencies and node ID
            + A dictionary between hashes and vertices
              (representing a valid hash map)
            + A list of ordered vertices, based on an the inverse
              topological ordering

    """
    (vertex, edge, ordering, labels) = tree
    v_ordered = sorted(list(vertex), key=lambda x: (ordering[x], labels[x]))
    vertices_hash_map = dict()
    vertices = dict()
    for v in v_ordered:
        if v not in edge or len(edge[v]) == 0:
            if labels[v] not in vertices_hash_map:
                vertices_hash_map[labels[v]] = list()
            vertices_hash_map[labels[v]].append(v)
            vertices[v] = [0, 1, str(labels[v])]
        else:
            neighbors_ids = []
            d = 0
            for n in edge[v]:
                d += 1 + vertices[n][0]
                neighbors_ids.append(vertices[n][2])

            ID = str(labels[v])+'('+(','.join(neighbors_ids))+')'

            vertices[v] = [d, 1, ID]
            if ID not in vertices_hash_map:
                vertices_hash_map[ID] = list()
            vertices_hash_map[ID].append(v)

    return (vertices, vertices_hash_map, v_ordered)


def big_dag_append(dag, big_dag=None, merge_features=True):
    """Calculate the *minimal DAG* or *BigDAG*.

    See :cite:`Martino2006` and notated in :cite:`Martino2012ATK`.

    Parameters
    ----------
    dag : tuple
        A tuple representing a single dag containing:
            + A dictionary from vertices to tuples containing the subtree size,
              frequencies and node ID
            + A dictionary between hashes and vertices
              (representing a valid hash map)
            + A list of ordered vertices, based on inverse topological ordering
            + A dictionary corresponding to edges from vertex to a list of
              adjacent vertices
            + A dictionary of labels

    big_dag : tuple, default=None
        The dag on which the dag will be appended.
        If None: builds it from dag, else the tuple is the same as the format
        of this function output.

    merge_features : bool, default=True
        If True increments frequencies when a same element is found, else keeps
        them as vectors.

    Returns
    -------
    big_dag : tuple
        A tuple representing Big_Dag:
            + A dictionary from vertices to tuples containing the subtree size,
              frequencies and node ID
            + A dictionary between hashes and vertices
              (representing a valid hash map)
            + A dictionary corresponding to edges from vertex to a list of
              adjacent vertices.
            + A dictionary of labels.

    """
    if big_dag is None:
        D_labels = dict()
        D_hash_map = dict()
        D_vertices = dict()
        D_edges = dict()
        nodes_idx = 0
        nf = 1
    else:
        (D_vertices, D_hash_map, D_edges, D_labels) = big_dag
        nodes_idx = len(D_vertices.keys())
        if not merge_features:
            f = True
            for v in D_vertices.keys():
                D_vertices[v][1].append(0)
                if f:
                    nf = len(D_vertices[v][1])
                    f = False
            if f:
                nf = 1

    (vertices, vertices_hash_map, v_ordered, edges, labels) = dag
    for q in v_ordered:
        key = vertices[q][2]
        if key in D_hash_map:
            node = D_hash_map[key][0]
            # update frequency
            if merge_features:
                D_vertices[node][1] += vertices[q][1]
            else:
                D_vertices[node][1][-1] += vertices[q][1]
        else:
            D_labels[nodes_idx] = labels[q]
            d_edges = list()
            # in order to avoid collisions
            v_nodes = set()
            for c in edges[q]:
                ck = vertices[c][2]
                if ck in D_hash_map:
                    node = D_hash_map[ck][0]
                    # add edges with their new indexes
                    if node not in v_nodes:
                        d_edges.append(node)
                        v_nodes.add(node)

            D_edges[nodes_idx] = d_edges
            D_hash_map[key] = [nodes_idx]

            if merge_features:
                freq = vertices[q][1]
            else:
                freq = (nf-1)*[0]+[vertices[q][1]]

            D_vertices[nodes_idx] = [vertices[q][1], freq, key]
            nodes_idx += 1

    return (D_vertices, D_hash_map, D_edges, D_labels)


if __name__ == "__main__":
    # Dag Example from original paper
    tree_a = ({0, 1, 2, 3},
              {0: [1, 2], 1: [3], 2: [], 3: []},
              {0: 'a', 1: 'b', 2: 'd', 3: 'c'})
    tree_b = ({0, 1, 2, 3, 4}, {0: [1, 2], 1: [3], 2: [4], 3: [], 4: []},
              {0: 'a', 1: 'b', 2: 'c', 3: 'c', 4: 'd'})

    todd_a = odd(tree_a[0], tree_a[1], tree_a[2])
    todd_b = odd(tree_b[0], tree_b[1], tree_b[2])

    Atree = tuple(hash_trees(todd_a))+tuple([])+(todd_a[1], todd_a[3])
    Btree = tuple(hash_trees(todd_b))+tuple([])+(todd_b[1], todd_b[3])
    big_dag = None
    big_dag = big_dag_append(Atree, big_dag)
    big_dag = big_dag_append(Btree, big_dag)

    print("Tree A:\n", Atree, "\nTree B:\n", Btree, "\nBig Dag:\n", big_dag)