# Source code for grakel.datasets.base

"""The base file for loading default datasets."""
# Python 2/3 cross-compatibility import
from __future__ import print_function

import os
import shutil
import zipfile
import ssl
try:
    # Python 2
    from urllib2 import HTTPError
    from urllib2 import urlopen
except ImportError:
    # Python 3+
    from urllib.error import HTTPError
    from urllib.request import urlopen

import numpy as np

from shutil import copyfileobj

from collections import Counter

from sklearn.utils import Bunch

from grakel.graph import Graph

# Common prefix of every dataset archive on the TU Dortmund mirror.
_DATASETS_URL = ("https://ls11-www.cs.uni-dortmund.de/people/morris/"
                 "graphkerneldatasets/")

# One row per supported dataset: (name, nl, el, na, ea) where the flags say
# whether the dataset ships node labels, edge labels, node attributes and
# edge attributes respectively.
_DATASET_FLAGS = (
    ("AIDS", True, True, True, False),
    ("BZR", True, False, True, False),
    ("BZR_MD", True, True, False, True),
    ("COIL-DEL", False, True, True, False),
    ("COIL-RAG", False, False, True, True),
    ("COLLAB", False, False, False, False),
    ("COX2", True, False, True, False),
    ("COX2_MD", True, True, False, True),
    ("DHFR", True, False, True, False),
    ("DHFR_MD", True, True, False, True),
    ("ER_MD", True, True, False, True),
    ("DD", True, False, False, False),
    ("ENZYMES", True, False, True, False),
    ("Cuneiform", True, True, True, True),
    ("FINGERPRINT", False, False, True, True),
    ("FIRSTMM_DB", True, False, True, True),
    ("FRANKENSTEIN", False, False, True, False),
    ("IMDB-BINARY", False, False, False, False),
    ("IMDB-MULTI", False, False, False, False),
    ("Letter-high", False, False, True, False),
    ("Letter-low", False, False, True, False),
    ("Letter-med", False, False, True, False),
    ("Mutagenicity", True, True, False, False),
    ("MSRC_9", True, False, False, False),
    ("MSRC_21", True, False, False, False),
    ("MSRC_21C", True, False, False, False),
    ("MUTAG", True, True, False, False),
    ("NCI1", True, False, False, False),
    ("NCI109", True, False, False, False),
    ("PTC_FM", True, True, False, False),
    ("PTC_FR", True, True, False, False),
    ("PTC_MM", True, True, False, False),
    ("PTC_MR", True, True, False, False),
    ("PROTEINS", True, False, True, False),
    ("PROTEINS_full", True, False, True, False),
    ("REDDIT-BINARY", False, False, False, False),
    ("REDDIT-MULTI-5K", False, False, False, False),
    ("REDDIT-MULTI-12K", False, False, False, False),
    ("SYNTHETIC", False, False, True, False),
    ("SYNTHETICnew", False, False, True, False),
    ("Synthie", False, False, True, False),
    ("Tox21_AHR", True, True, False, False),
    ("Tox21_AR", True, True, False, False),
    ("Tox21_AR-LBD", True, True, False, False),
    ("Tox21_ARE", True, True, False, False),
    ("Tox21_aromatase", True, True, False, False),
    ("Tox21_ATAD5", True, True, False, False),
    ("Tox21_ER", True, True, False, False),
    ("Tox21_ER_LBD", True, True, False, False),
    ("Tox21_HSE", True, True, False, False),
    ("Tox21_MMP", True, True, False, False),
    ("Tox21_p53", True, True, False, False),
    ("Tox21_PPAR-gamma", True, True, False, False),
)

# Datasets whose archive file name is not simply "<name>.zip".
_ARCHIVE_NAMES = {"FINGERPRINT": "Fingerprint"}

# Public table: dataset name -> {"nl", "el", "na", "ea", "link"}.
# The link is derived from the dataset name so that it cannot silently point
# at another dataset's archive or carry a malformed extension.
dataset_metadata = {
    name: {"nl": nl, "el": el, "na": na, "ea": ea,
           "link": _DATASETS_URL + _ARCHIVE_NAMES.get(name, name) + ".zip"}
    for (name, nl, el, na, ea) in _DATASET_FLAGS
}

# Default for the `is_symmetric` argument used when parsing edge lists.
symmetric_dataset = False


def read_data(
        name,
        with_classes=True,
        prefer_attr_nodes=False,
        prefer_attr_edges=False,
        produce_labels_nodes=False,
        as_graphs=False,
        is_symmetric=symmetric_dataset):
    """Create a dataset iterable for GraphKernel.

    The dataset files are read from ``./<name>/<name>_*.txt``, i.e. relative
    to the current working directory.

    Parameters
    ----------
    name : str
        The dataset name.

    with_classes : bool, default=True
        Return an iterable of class labels based on the enumeration.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found.
        Currently this means labeling its node by its degree inside the Graph.
        This operation is applied only if node labels are non existent.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes
        set as labels for the graph object for *nodes* the attributes.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes
        set as labels for the graph object for *edge* the attributes.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    is_symmetric : bool, default=False
        Defines if the graph data describe a symmetric graph.

    Returns
    -------
    bunch : sklearn.utils.Bunch
        A bunch with the following fields:

        - **data** : a list with one entry per graph, each being either a
          ``Graph`` object (`as_graphs=True`) or a list
          ``[edges, node_labels, edge_labels]``.
        - **target** : np.array, present only if `with_classes` is True.
          A one dimensional array of graph classes aligned with the
          entries of `data`. Useful for classification.

    """
    name = str(name)
    prefix = "./" + name + "/" + name
    indicator_path = prefix + "_graph_indicator.txt"
    edges_path = prefix + "_A.txt"
    node_labels_path = prefix + "_node_labels.txt"
    node_attributes_path = prefix + "_node_attributes.txt"
    edge_labels_path = prefix + "_edge_labels.txt"
    edge_attributes_path = prefix + "_edge_attributes.txt"
    graph_classes_path = prefix + "_graph_labels.txt"

    # Flags of the dataset; for a name missing from the metadata table the
    # lookups below fall back to checking whether the file exists on disk.
    info = dataset_metadata.get(name, {})

    # node graph correspondence
    ngc = dict()
    # edge line correspondence
    elc = dict()
    # dictionary that keeps sets of edges
    Graphs = dict()
    # dictionary of labels for nodes
    node_labels = dict()
    # dictionary of labels for edges
    edge_labels = dict()

    # NOTE: lines are handed to int()/float() unsliced; both ignore the
    # surrounding whitespace, so a last line without a trailing newline is
    # parsed correctly instead of losing its final character.

    # Associate graphs nodes with indexes
    with open(indicator_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            graph_id = int(line)
            ngc[i] = graph_id
            Graphs.setdefault(graph_id, set())
            node_labels.setdefault(graph_id, dict())
            edge_labels.setdefault(graph_id, dict())

    # Extract graph edges
    with open(edges_path, "r") as f:
        for (i, line) in enumerate(f, 1):
            fields = line.split(",")
            src, dst = int(fields[0]), int(fields[1])
            elc[i] = (src, dst)
            Graphs[ngc[src]].add((src, dst))
            if is_symmetric:
                Graphs[ngc[dst]].add((dst, src))

    # Extract node attributes
    if (prefer_attr_nodes and
            info.get("na", os.path.exists(node_attributes_path))):
        with open(node_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = \
                    [float(num) for num in line.split(",")]
    # Extract node labels
    elif info.get("nl", os.path.exists(node_labels_path)):
        with open(node_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                node_labels[ngc[i]][i] = int(line)
    elif produce_labels_nodes:
        # Label every node that appears as an edge source with the number
        # of non self-loop edges starting from it.
        for i in range(1, len(Graphs)+1):
            node_labels[i] = dict(Counter(s for (s, d) in Graphs[i] if s != d))

    # Extract edge attributes
    if (prefer_attr_edges and
            info.get("ea", os.path.exists(edge_attributes_path))):
        with open(edge_attributes_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                attrs = [float(num) for num in line.split(",")]
                src, dst = elc[i]
                edge_labels[ngc[src]][(src, dst)] = attrs
                if is_symmetric:
                    edge_labels[ngc[dst]][(dst, src)] = attrs

    # Extract edge labels
    elif info.get("el", os.path.exists(edge_labels_path)):
        with open(edge_labels_path, "r") as f:
            for (i, line) in enumerate(f, 1):
                label = int(line)
                src, dst = elc[i]
                edge_labels[ngc[src]][(src, dst)] = label
                if is_symmetric:
                    edge_labels[ngc[dst]][(dst, src)] = label

    # Graph ids are expected to be the consecutive integers 1..len(Graphs)
    Gs = list()
    if as_graphs:
        for i in range(1, len(Graphs)+1):
            Gs.append(Graph(Graphs[i], node_labels[i], edge_labels[i]))
    else:
        for i in range(1, len(Graphs)+1):
            Gs.append([Graphs[i], node_labels[i], edge_labels[i]])

    if with_classes:
        with open(graph_classes_path, "r") as f:
            classes = [int(line) for line in f]

        # The `np.int` alias no longer exists in recent NumPy releases;
        # the builtin `int` selects the same dtype.
        classes = np.array(classes, dtype=int)
        return Bunch(data=Gs, target=classes)
    else:
        return Bunch(data=Gs)


def _download_zip(url, output_name):
    """Download a file from a requested url and store locally.

    The content is written to ``output_name + ".zip"``, resolved against the
    current working directory when `output_name` is relative.

    Parameters
    ----------
    url : str
        The url from where the file will be downloaded.

    output_name : str
        The name of the file in the local directory (without extension).

    Returns
    -------
    None.

    Raises
    ------
    HTTPError
        If the server answers with an HTTP error. For a 404 the message is
        replaced by one that names the dataset and the requested url.

    """
    # The default purpose (SERVER_AUTH) is the one meant for client-side
    # connections: it verifies the server certificate and its hostname.
    ctx = ssl.create_default_context()
    filename = output_name + ".zip"
    try:
        data_url = urlopen(url, context=ctx)
    except HTTPError as e:
        if e.code == 404:
            e.msg = "Dataset '%s' not found at '%s'." % (output_name, url)
        raise

    # Store Zip File
    try:
        with open(filename, 'w+b') as zip_file:
            copyfileobj(data_url, zip_file)
    except BaseException:
        # Never leave a truncated archive behind (also on an interrupt), and
        # do not hide the original error if the file was never created.
        if os.path.exists(filename):
            os.remove(filename)
        raise
    finally:
        data_url.close()


def fetch_dataset(
        name,
        verbose=True,
        data_home=None,
        download_if_missing=True,
        with_classes=True,
        produce_labels_nodes=False,
        prefer_attr_nodes=False,
        prefer_attr_edges=False,
        as_graphs=False):
    """Access a large collection of benchmark datasets from TU Dortmund :cite:`KKMMN2016`.

    For more info visit: :xref:`gd`

    Parameters
    ----------
    name : str
        The name of the dataset (as found in :xref:`gd`).

    verbose : bool, default=True
        Print messages, throughout execution.

    data_home : string, default=None
        Specify another download and cache folder for the datasets.
        By default all grakel data is stored in '~/grakel_data' subfolders.

    download_if_missing : boolean, default=True
        If False, raise a IOError if the data is not locally available
        instead of trying to download the data from the source site.

    with_classes : bool, default=True
        Return an iterable of class labels based on the enumeration.

    produce_labels_nodes : bool, default=False
        Produce labels for nodes if not found.
        Currently this means labeling its node by its degree inside the Graph.
        This operation is applied only if node labels are non existent.

    prefer_attr_nodes : bool, default=False
        If a dataset has both *node* labels and *node* attributes
        set as labels for the graph object for *nodes* the attributes.

    prefer_attr_edges : bool, default=False
        If a dataset has both *edge* labels and *edge* attributes
        set as labels for the graph object for *edge* the attributes.

    as_graphs : bool, default=False
        Return data as a list of Graph Objects.

    Returns
    -------
    graphs : iterable
        Returns an iterable of the produced *valid-graph-format*
        and labels for each node.

    classes : list
        Returns a list of all the classes corresponding to each graph by
        order of input.

    Raises
    ------
    ValueError
        If `name` is not one of the supported datasets.

    IOError
        If the archive is not cached and `download_if_missing` is False.

    """
    name = str(name)
    if name not in dataset_metadata:
        raise ValueError('Dataset: "'+str(name)+'" is currently unsupported.' +
                         '\nSupported datasets come from '
                         'https://ls11-www.cs.tu-dortmund.de/staff/morris/' +
                         'graphkerneldatasets. If your dataset name appears' +
                         ' them send us a pm, to explain you either why we ' +
                         'don\'t support it, or to update our dataset ' +
                         'database.')

    if data_home is None:
        data_home = os.path.join(os.path.expanduser("~"), 'grakel_data')

    exists = os.path.isdir(data_home)
    missing = not os.path.exists(os.path.join(data_home, name + ".zip"))

    if missing and not download_if_missing:
        raise IOError('Dataset ' + name + ' was not found on ' +
                      str(data_home))

    if missing and not exists:
        if verbose:
            print("Initializing folder at", str(data_home))
        os.makedirs(data_home)

    # The download, the extraction and the parser all use paths relative to
    # the data directory, so work from there and always come back, even if
    # one of the steps raises.
    cwd = os.getcwd()
    os.chdir(data_home)
    try:
        if missing:
            if verbose:
                print("Downloading dataset for", name + "..")
            _download_zip(dataset_metadata[name]["link"], name)

        try:
            with zipfile.ZipFile(name + '.zip', "r") as zip_ref:
                if verbose:
                    print("Extracting dataset ", name + "..")
                zip_ref.extractall()

            if verbose:
                print("Parsing dataset ", name + "..")

            data = read_data(name,
                             with_classes=with_classes,
                             prefer_attr_nodes=prefer_attr_nodes,
                             prefer_attr_edges=prefer_attr_edges,
                             produce_labels_nodes=produce_labels_nodes,
                             is_symmetric=symmetric_dataset,
                             as_graphs=as_graphs)

            if verbose:
                print("Parse was succesful..")
        finally:
            # Only the archive is kept as cache; the extracted files are
            # removed whether or not parsing succeeded.
            if os.path.isdir(name):
                if verbose:
                    print("Deleting unzipped dataset files..")
                shutil.rmtree(name)
    finally:
        if verbose:
            print("Going back to the original directory..")
        os.chdir(cwd)

    return data


def get_dataset_info(dataset_name, default=None):
    """Look up the metadata entry of a dataset.

    Parameters
    ----------
    dataset_name : str
        The name of the dataset.

    default : Object, default=None
        The value handed back when the dataset is not in the metadata table.

    Returns
    -------
    dictionary_get : default or dictionary_entry
        Either `default`, or the dataset entry, a dictionary with fields:

        - **nl** : A boolean flag indicating if the dataset has node labels.
        - **el** : A boolean flag indicating if the dataset has edge labels.
        - **na** : A boolean flag indicating if the dataset has
          node attributes.
        - **ea** : A boolean flag indicating if the dataset has
          edge attributes.
        - **link** : A str corresponding to the download link of the dataset.

    """
    try:
        entry = dataset_metadata[dataset_name]
    except KeyError:
        # Unknown dataset: answer with the caller supplied fallback.
        entry = default
    return entry