@@ -0,0 +1,338 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 3 22:22:57 2020

@author: ljia

@references:

    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
        labeled graphs. In Proceedings of the 20th International Conference
        on Machine Learning, Washington, DC, United States, 2003.

    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
        Jean-Philippe Vert. Extensions of marginalized graph kernels. In
        Proceedings of the Twenty-First International Conference on Machine
        Learning, page 70. ACM, 2004.
"""

import sys
from itertools import combinations_with_replacement
from multiprocessing import Pool

import networkx as nx
import numpy as np
from tqdm import tqdm

from gklearn.kernels import GraphKernel
from gklearn.utils import SpecialLabel
from gklearn.utils.kernels import deltakernel
from gklearn.utils.parallel import parallel_gm, parallel_me
from gklearn.utils.utils import untotterTransformation


class Marginalized(GraphKernel):

    def __init__(self, **kwargs):
        GraphKernel.__init__(self)
        self.__node_labels = kwargs.get('node_labels', [])
        self.__edge_labels = kwargs.get('edge_labels', [])
        # Probability for the random walk to stop at each step.
        self.__p_quit = kwargs.get('p_quit', 0.5)
        # Number of iterations used to approximate the fixed point R_inf.
        self.__n_iteration = int(kwargs.get('n_iteration', 10))
        # Whether to apply the tottering-removal transformation of ref [2].
        self.__remove_totters = kwargs.get('remove_totters', False)
        self.__ds_infos = kwargs.get('ds_infos', {})
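
        # Construction sketch (the label names are illustrative, not from
        # the original source):
        #     Marginalized(node_labels=['atom'], edge_labels=['bond_type'],
        #                  p_quit=0.3, n_iteration=20)
        # The label names must match node/edge attribute keys of the input
        # NetworkX graphs; unlabeled graphs are handled by
        # __add_dummy_labels() below.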

    def _compute_gm_series(self):
        self.__add_dummy_labels(self._graphs)

        if self.__remove_totters:
            if self._verbose >= 2:
                iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout)
            else:
                iterator = self._graphs
            # @todo: this may not work.
            self._graphs = [untotterTransformation(G, self.__node_labels, self.__edge_labels) for G in iterator]

        # Compute the Gram matrix over all pairs (i, j) with i <= j; each
        # kernel value is mirrored to (j, i).
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        itr = combinations_with_replacement(range(0, len(self._graphs)), 2)
        if self._verbose >= 2:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        for i, j in iterator:
            kernel = self.__kernel_do(self._graphs[i], self._graphs[j])
            gram_matrix[i][j] = kernel
            gram_matrix[j][i] = kernel  # @todo: no directed graph considered?

        return gram_matrix

    def _compute_gm_imap_unordered(self):
        self.__add_dummy_labels(self._graphs)

        if self.__remove_totters:
            pool = Pool(self._n_jobs)
            itr = range(0, len(self._graphs))
            # Pick a chunksize that keeps every worker busy on small
            # datasets without creating oversized chunks.
            if len(self._graphs) < 100 * self._n_jobs:
                chunksize = int(len(self._graphs) / self._n_jobs) + 1
            else:
                chunksize = 100
            remove_fun = self._wrapper_untotter
            if self._verbose >= 2:
                iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
                                desc='removing tottering', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(remove_fun, itr, chunksize)
            for i, g in iterator:
                self._graphs[i] = g
            pool.close()
            pool.join()

        # Compute the Gram matrix in parallel; the graph list is shared with
        # the workers as a module-level global to avoid pickling it per task.
        gram_matrix = np.zeros((len(self._graphs), len(self._graphs)))

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare
        do_fun = self._wrapper_kernel_do
        parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker,
                    glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose)

        return gram_matrix

    def _compute_kernel_list_series(self, g1, g_list):
        self.__add_dummy_labels(g_list + [g1])

        if self.__remove_totters:
            g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels)  # @todo: this may not work.
            if self._verbose >= 2:
                iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout)
            else:
                iterator = g_list
            # @todo: this may not work.
            g_list = [untotterTransformation(G, self.__node_labels, self.__edge_labels) for G in iterator]

        # compute kernel list.
        kernel_list = [None] * len(g_list)
        if self._verbose >= 2:
            iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout)
        else:
            iterator = range(len(g_list))
        for i in iterator:
            kernel = self.__kernel_do(g1, g_list[i])
            kernel_list[i] = kernel

        return kernel_list

    def _compute_kernel_list_imap_unordered(self, g1, g_list):
        self.__add_dummy_labels(g_list + [g1])

        if self.__remove_totters:
            g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels)  # @todo: this may not work.
            pool = Pool(self._n_jobs)
            itr = range(0, len(g_list))
            if len(g_list) < 100 * self._n_jobs:
                chunksize = int(len(g_list) / self._n_jobs) + 1
            else:
                chunksize = 100
            # Note: _wrapper_untotter indexes self._graphs, not g_list; this
            # assumes self._graphs has been set accordingly upstream.
            remove_fun = self._wrapper_untotter
            if self._verbose >= 2:
                iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize),
                                desc='removing tottering', file=sys.stdout)
            else:
                iterator = pool.imap_unordered(remove_fun, itr, chunksize)
            for i, g in iterator:
                g_list[i] = g
            pool.close()
            pool.join()

        # compute kernel list.
        kernel_list = [None] * len(g_list)

        def init_worker(g1_toshare, g_list_toshare):
            global G_g1, G_g_list
            G_g1 = g1_toshare
            G_g_list = g_list_toshare
        do_fun = self._wrapper_kernel_list_do

        def func_assign(result, var_to_assign):
            var_to_assign[result[0]] = result[1]
        itr = range(len(g_list))
        len_itr = len(g_list)
        parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr,
                    init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered',
                    n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose)

        return kernel_list

    def _wrapper_kernel_list_do(self, itr):
        return itr, self.__kernel_do(G_g1, G_g_list[itr])

    def _compute_single_kernel_series(self, g1, g2):
        self.__add_dummy_labels([g1] + [g2])
        if self.__remove_totters:
            g1 = untotterTransformation(g1, self.__node_labels, self.__edge_labels)  # @todo: this may not work.
            g2 = untotterTransformation(g2, self.__node_labels, self.__edge_labels)
        kernel = self.__kernel_do(g1, g2)
        return kernel

    def __kernel_do(self, g1, g2):
        """Compute the marginalized kernel between two graphs.

        Parameters
        ----------
        g1, g2 : NetworkX graphs
            Graphs between which the kernel is computed.

        Return
        ------
        kernel : float
            Marginalized kernel between the two graphs.
        """
        # init parameters
        kernel = 0
        num_nodes_G1 = nx.number_of_nodes(g1)
        num_nodes_G2 = nx.number_of_nodes(g2)
        # The initial probability distribution of the random-walk generating
        # step (uniform over the nodes of each graph).
        p_init_G1 = 1 / num_nodes_G1
        p_init_G2 = 1 / num_nodes_G2

        # Probability that both walks stop simultaneously at the current pair.
        q = self.__p_quit * self.__p_quit
        r1 = q

        # Legacy dense-matrix implementation, kept for reference:
        # # initial R_inf
        # # matrix to save all the R_inf for all pairs of nodes
        # R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
        #
        # # calculate R_inf with a simple iterative method
        # for i in range(1, n_iteration):
        #     R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
        #     R_inf_new.fill(r1)
        #
        #     # calculate R_inf for each pair of nodes
        #     for node1 in g1.nodes(data=True):
        #         neighbor_n1 = g1[node1[0]]
        #         # the transition probability distribution in the random walks
        #         # generating step (uniform distribution over the vertices
        #         # adjacent to the current vertex)
        #         if len(neighbor_n1) > 0:
        #             p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
        #             for node2 in g2.nodes(data=True):
        #                 neighbor_n2 = g2[node2[0]]
        #                 if len(neighbor_n2) > 0:
        #                     p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
        #
        #                     for neighbor1 in neighbor_n1:
        #                         for neighbor2 in neighbor_n2:
        #                             t = p_trans_n1 * p_trans_n2 * \
        #                                 deltakernel(g1.nodes[neighbor1][node_label],
        #                                             g2.nodes[neighbor2][node_label]) * \
        #                                 deltakernel(
        #                                     neighbor_n1[neighbor1][edge_label],
        #                                     neighbor_n2[neighbor2][edge_label])
        #
        #                             R_inf_new[node1[0]][node2[0]] += t * R_inf[
        #                                 neighbor1][neighbor2]  # ref [1] equation (8)
        #     R_inf[:] = R_inf_new
        #
        # # add elements of R_inf up and calculate kernel
        # for node1 in g1.nodes(data=True):
        #     for node2 in g2.nodes(data=True):
        #         s = p_init_G1 * p_init_G2 * deltakernel(
        #             node1[1][node_label], node2[1][node_label])
        #         kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)

        R_inf = {}  # dict to save all the R_inf for all pairs of nodes
        # Initial R_inf (the first iteration): r1 = p_quit^2 if both nodes
        # have neighbors, p_quit if exactly one of them is isolated (the
        # isolated walk must stop), and 1 if both are isolated.
        for node1 in g1.nodes():
            for node2 in g2.nodes():
                # R_inf[(node1[0], node2[0])] = r1
                if len(g1[node1]) > 0:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = r1
                    else:
                        R_inf[(node1, node2)] = self.__p_quit
                else:
                    if len(g2[node2]) > 0:
                        R_inf[(node1, node2)] = self.__p_quit
                    else:
                        R_inf[(node1, node2)] = 1

        # Precompute all transition probabilities first.
        t_dict = {}
        if self.__n_iteration > 1:
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # The transition probability distribution of the random-walk
                # generating step (uniform over the vertices adjacent to the
                # current vertex).
                if len(neighbor_n1) > 0:
                    p_trans_n1 = (1 - self.__p_quit) / len(neighbor_n1)
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            p_trans_n2 = (1 - self.__p_quit) / len(neighbor_n2)
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    # deltakernel is 1 if the label tuples
                                    # are equal and 0 otherwise.
                                    t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                        p_trans_n1 * p_trans_n2 * \
                                        deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self.__node_labels),
                                                    tuple(g2.nodes[neighbor2][nl] for nl in self.__node_labels)) * \
                                        deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self.__edge_labels),
                                                    tuple(neighbor_n2[neighbor2][el] for el in self.__edge_labels))
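
        # Worked example (an illustration, not from the source): with
        # p_quit = 0.5, a node with three neighbors spreads the remaining
        # probability mass uniformly, p_trans = (1 - 0.5) / 3 ≈ 0.1667, so a
        # matching neighbor pair contributes t = p_trans_n1 * p_trans_n2.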

        # Approximate R_inf with a simple iterative method.
        for i in range(2, self.__n_iteration + 1):
            R_inf_old = R_inf.copy()

            # calculate R_inf for each pair of nodes
            for node1 in g1.nodes():
                neighbor_n1 = g1[node1]
                # the transition probability distribution of the random-walk
                # generating step (uniform over the vertices adjacent to the
                # current vertex)
                if len(neighbor_n1) > 0:
                    for node2 in g2.nodes():
                        neighbor_n2 = g2[node2]
                        if len(neighbor_n2) > 0:
                            R_inf[(node1, node2)] = r1
                            for neighbor1 in neighbor_n1:
                                for neighbor2 in neighbor_n2:
                                    R_inf[(node1, node2)] += \
                                        (t_dict[(node1, node2, neighbor1, neighbor2)]
                                         * R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

        # add elements of R_inf up and calculate kernel
        for (n1, n2), value in R_inf.items():
            s = p_init_G1 * p_init_G2 * deltakernel(
                tuple(g1.nodes[n1][nl] for nl in self.__node_labels),
                tuple(g2.nodes[n2][nl] for nl in self.__node_labels))
            kernel += s * value  # ref [1] equation (6)

        return kernel

    def _wrapper_kernel_do(self, itr):
        i = itr[0]
        j = itr[1]
        return i, j, self.__kernel_do(G_gn[i], G_gn[j])

    def _wrapper_untotter(self, i):
        return i, untotterTransformation(self._graphs[i], self.__node_labels, self.__edge_labels)  # @todo: this may not work.

    def __add_dummy_labels(self, Gn):
        # If no labels are given, attach a constant dummy label so that the
        # delta kernels compare equal everywhere.
        if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.__node_labels = [SpecialLabel.DUMMY]
        if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY):
            for i in range(len(Gn)):
                nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY)
            self.__edge_labels = [SpecialLabel.DUMMY]
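

# A minimal usage sketch, not part of the original file. The toy graphs and
# the label names ('atom', 'bond_type') are illustrative. In graphkit-learn
# the public entry point is GraphKernel.compute(); the series helper defined
# above is called directly here to keep the sketch self-contained.
if __name__ == '__main__':
    g1 = nx.Graph()
    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    g1.add_edge(0, 1, bond_type='1')

    g2 = nx.Graph()
    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '2'})])

    mk = Marginalized(node_labels=['atom'], edge_labels=['bond_type'],
                      p_quit=0.3, n_iteration=10, remove_totters=False)
    print(mk._compute_single_kernel_series(g1, g2))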