Note
Click here to download the full example code
Retrieval of most similar document using the Weisfeiler-Lehman subtree kernel.¶
Script makes use of grakel.WeisfeilerLehman, grakel.VertexHistogram
from __future__ import print_function
print(__doc__)
import numpy as np
import time
from nltk import word_tokenize
from nltk.corpus import sentence_polarity
from grakel.kernels import WeisfeilerLehman, VertexHistogram
from grakel import Graph
sents = sentence_polarity.sents()
sents = [sent for sent in sents if len(sent) > 1]
n_sents = 3000
sents = sents[:n_sents]
print("Loaded %d sentences\n" % n_sents)
print("Creating word co-occurrence networks\n")
word_networks = list()
for sent in sents:
    node_labels = dict()
    tokens_to_ids = dict()
    for token in sent:
        if token not in tokens_to_ids:
            tokens_to_ids[token] = len(tokens_to_ids)
            node_labels[tokens_to_ids[token]] = token
    edges = list()
    for i in range(len(sent)-1):
        edges.append((tokens_to_ids[sent[i]], tokens_to_ids[sent[i+1]]))
    word_networks.append(Graph(edges, node_labels=node_labels))
query_sent_id = 54
query_sent = [word_networks[query_sent_id]]
# Initialize Weisfeiler-Lehman subtree kernel
gk = WeisfeilerLehman(niter=2, normalize=True, base_graph_kernel=VertexHistogram)
print("Computing similarities\n")
t0 = time.time()
gk.fit(query_sent)
K = gk.transform(word_networks)
print("done in %0.3fs\n" % (time.time() - t0))
print("Query sentence")
print("--------------")
print(" ".join(sents[query_sent_id]))
print()
print("Most similar sentence")
print("---------------------")
print(" ".join(sents[np.argsort(K[:,0])[-2]]))
Total running time of the script: ( 0 minutes 0.000 seconds)