diff --git a/lang/fr/gklearn/kernels/marginalizedKernel.py b/lang/fr/gklearn/kernels/marginalizedKernel.py
new file mode 100644
index 0000000..950f1a6
--- /dev/null
+++ b/lang/fr/gklearn/kernels/marginalizedKernel.py
@@ -0,0 +1,307 @@
+"""
+@author: linlin
+
+@references:
+
+    [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
+    labeled graphs. In Proceedings of the 20th International Conference on
+    Machine Learning, Washington, DC, United States, 2003.
+
+    [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
+    Jean-Philippe Vert. Extensions of marginalized graph kernels. In
+    Proceedings of the Twenty-First International Conference on Machine
+    Learning, page 70. ACM, 2004.
+"""
+
+import sys
+import time
+from functools import partial
+from multiprocessing import Pool, cpu_count
+from tqdm import tqdm
+tqdm.monitor_interval = 0
+#import traceback
+
+import networkx as nx
+import numpy as np
+
+from gklearn.utils.kernels import deltakernel
+from gklearn.utils.utils import untotterTransformation
+from gklearn.utils.graphdataset import get_dataset_attributes
+from gklearn.utils.parallel import parallel_gm
+
+
+def marginalizedkernel(*args,
+                       node_label='atom',
+                       edge_label='bond_type',
+                       p_quit=0.5,
+                       n_iteration=20,
+                       remove_totters=False,
+                       n_jobs=None,
+                       chunksize=None,
+                       verbose=True):
+    """Compute marginalized graph kernels between graphs.
+
+    Parameters
+    ----------
+    Gn : List of NetworkX graph
+        List of graphs between which the kernels are computed.
+
+    G1, G2 : NetworkX graphs
+        Two graphs between which the kernel is computed.
+
+    node_label : string
+        Node attribute used as symbolic label. The default node label is
+        'atom'.
+
+    edge_label : string
+        Edge attribute used as symbolic label. The default edge label is
+        'bond_type'.
+
+    p_quit : float
+        The termination probability in the random-walk generating step,
+        between 0 and 1.
+
+    n_iteration : integer
+        Number of iterations used to approximate R_inf.
+
+    remove_totters : boolean
+        Whether to remove totterings by the method introduced in [2]. The
+        default value is False.
+
+    n_jobs : int
+        Number of jobs for parallelization.
+
+    chunksize : int
+        Chunk size for parallelization. If None, a heuristic value is
+        computed from len(Gn) and n_jobs.
+
+    verbose : boolean
+        Whether to print progress and timing information.
+
+    Returns
+    -------
+    Kmatrix : Numpy matrix
+        Kernel matrix, each element of which is the marginalized kernel
+        between two graphs.
+    run_time : float
+        Time in seconds spent building the kernel matrix.
+    """
+    # pre-process
+    n_iteration = int(n_iteration)
+    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]
+    Gn = [g.copy() for g in Gn]
+
+    ds_attrs = get_dataset_attributes(
+        Gn,
+        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
+        node_label=node_label, edge_label=edge_label)
+    if not ds_attrs['node_labeled'] or node_label is None:
+        node_label = 'atom'
+        for G in Gn:
+            nx.set_node_attributes(G, '0', 'atom')
+    if not ds_attrs['edge_labeled'] or edge_label is None:
+        edge_label = 'bond_type'
+        for G in Gn:
+            nx.set_edge_attributes(G, '0', 'bond_type')
+
+    start_time = time.time()
+
+    if remove_totters:
+        # ---- use pool.imap_unordered to parallelize and track progress. ----
+        if n_jobs is None:  # Pool(None) would work, but the chunk-size
+            n_jobs = cpu_count()  # heuristic below needs a concrete number.
+        pool = Pool(n_jobs)
+        untotter_partial = partial(wrapper_untotter, Gn, node_label, edge_label)
+        if chunksize is None:
+            if len(Gn) < 100 * n_jobs:
+                chunksize = int(len(Gn) / n_jobs) + 1
+            else:
+                chunksize = 100
+        for i, g in tqdm(
+                pool.imap_unordered(
+                    untotter_partial, range(0, len(Gn)), chunksize),
+                desc='removing tottering',
+                file=sys.stdout):
+            Gn[i] = g
+        pool.close()
+        pool.join()
+
+#        # ---- direct running, normally uses a single CPU core. ----
+#        Gn = [
+#            untotterTransformation(G, node_label, edge_label)
+#            for G in tqdm(Gn, desc='removing tottering', file=sys.stdout)
+#        ]
+
+    Kmatrix = np.zeros((len(Gn), len(Gn)))
+
+    # ---- use pool.imap_unordered to parallelize and track progress. ----
+    def init_worker(gn_toshare):
+        global G_gn
+        G_gn = gn_toshare
+    do_partial = partial(wrapper_marg_do, node_label, edge_label,
+                         p_quit, n_iteration)
+    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
+                glbv=(Gn,), n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)
+
+#    # ---- direct running, normally uses a single CPU core. ----
+##    pbar = tqdm(
+##        total=(1 + len(Gn)) * len(Gn) / 2,
+##        desc='calculating kernels',
+##        file=sys.stdout)
+#    for i in range(0, len(Gn)):
+#        for j in range(i, len(Gn)):
+##            print(i, j)
+#            Kmatrix[i][j] = _marginalizedkernel_do(Gn[i], Gn[j], node_label,
+#                                                   edge_label, p_quit, n_iteration)
+#            Kmatrix[j][i] = Kmatrix[i][j]
+##            pbar.update(1)
+
+    run_time = time.time() - start_time
+    if verbose:
+        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
+              % (len(Gn), run_time))
+
+    return Kmatrix, run_time
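+
+
+# A note on the per-pair computation below: following ref [1], with a
+# uniform initial distribution p_init and a constant termination
+# probability p_quit, each pairwise kernel value is
+#     k(G1, G2) = sum_{u in V1, v in V2} p_init(u) * p_init(v)
+#                 * delta(l(u), l(v)) * R_inf(u, v)     # ref [1] eq. (6)
+# where delta is the delta kernel on node labels and R_inf is approximated
+# by n_iteration steps of the recursion in equation (8) of [1].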
+def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
+    """Compute the marginalized graph kernel between two graphs.
+
+    Parameters
+    ----------
+    g1, g2 : NetworkX graphs
+        Two graphs between which the kernel is computed.
+    node_label : string
+        Node attribute used as label.
+    edge_label : string
+        Edge attribute used as label.
+    p_quit : float
+        The termination probability in the random-walk generating step.
+    n_iteration : integer
+        Number of iterations used to approximate R_inf.
+
+    Returns
+    -------
+    kernel : float
+        Marginalized kernel between the two graphs.
+    """
+    # init parameters
+    kernel = 0
+    num_nodes_G1 = nx.number_of_nodes(g1)
+    num_nodes_G2 = nx.number_of_nodes(g2)
+    # the initial probability distribution in the random-walk generating
+    # step (uniform distribution over |G|)
+    p_init_G1 = 1 / num_nodes_G1
+    p_init_G2 = 1 / num_nodes_G2
+
+    q = p_quit * p_quit
+    r1 = q
+
+#    # initial R_inf
+#    # matrix to save all the R_inf for all pairs of nodes
+#    R_inf = np.zeros([num_nodes_G1, num_nodes_G2])
+#
+#    # calculate R_inf with a simple iterative method
+#    for i in range(1, n_iteration):
+#        R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2])
+#        R_inf_new.fill(r1)
+#
+#        # calculate R_inf for each pair of nodes
+#        for node1 in g1.nodes(data=True):
+#            neighbor_n1 = g1[node1[0]]
+#            # the transition probability distribution in the random-walk
+#            # generating step (uniform distribution over the vertices
+#            # adjacent to the current vertex)
+#            if len(neighbor_n1) > 0:
+#                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
+#                for node2 in g2.nodes(data=True):
+#                    neighbor_n2 = g2[node2[0]]
+#                    if len(neighbor_n2) > 0:
+#                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
+#
+#                        for neighbor1 in neighbor_n1:
+#                            for neighbor2 in neighbor_n2:
+#                                t = p_trans_n1 * p_trans_n2 * \
+#                                    deltakernel(g1.nodes[neighbor1][node_label],
+#                                                g2.nodes[neighbor2][node_label]) * \
+#                                    deltakernel(
+#                                        neighbor_n1[neighbor1][edge_label],
+#                                        neighbor_n2[neighbor2][edge_label])
+#
+#                                R_inf_new[node1[0]][node2[0]] += t * R_inf[
+#                                    neighbor1][neighbor2]  # ref [1] equation (8)
+#        R_inf[:] = R_inf_new
+#
+#    # add up elements of R_inf and calculate the kernel
+#    for node1 in g1.nodes(data=True):
+#        for node2 in g2.nodes(data=True):
+#            s = p_init_G1 * p_init_G2 * deltakernel(
+#                node1[1][node_label], node2[1][node_label])
+#            kernel += s * R_inf[node1[0]][node2[0]]  # ref [1] equation (6)
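+
+    # First iteration (initialization): R_1(u, v) = p_quit(u) * p_quit(v),
+    # where the quit probability of a node with no neighbors is 1, since a
+    # random walk must terminate there. Hence the four cases below:
+    # r1 = p_quit^2 when both nodes have neighbors, p_quit when exactly
+    # one does, and 1 when neither does.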
+    R_inf = {}  # dict saving the R_inf values for all pairs of nodes
+    for node1 in g1.nodes():
+        for node2 in g2.nodes():
+            if len(g1[node1]) > 0:
+                if len(g2[node2]) > 0:
+                    R_inf[(node1, node2)] = r1
+                else:
+                    R_inf[(node1, node2)] = p_quit
+            else:
+                if len(g2[node2]) > 0:
+                    R_inf[(node1, node2)] = p_quit
+                else:
+                    R_inf[(node1, node2)] = 1
+
+    # pre-compute all transition probabilities first: each entry of t_dict
+    # combines the transition probabilities of both walks with the delta
+    # kernels on node and edge labels, so the fixed-point loop below does
+    # not recompute them at every iteration.
+    t_dict = {}
+    if n_iteration > 1:
+        for node1 in g1.nodes():
+            neighbor_n1 = g1[node1]
+            # the transition probability distribution in the random-walk
+            # generating step (uniform distribution over the vertices
+            # adjacent to the current vertex)
+            if len(neighbor_n1) > 0:
+                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
+                for node2 in g2.nodes():
+                    neighbor_n2 = g2[node2]
+                    if len(neighbor_n2) > 0:
+                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
+                        for neighbor1 in neighbor_n1:
+                            for neighbor2 in neighbor_n2:
+                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
+                                    p_trans_n1 * p_trans_n2 * \
+                                    deltakernel(g1.nodes[neighbor1][node_label],
+                                                g2.nodes[neighbor2][node_label]) * \
+                                    deltakernel(
+                                        neighbor_n1[neighbor1][edge_label],
+                                        neighbor_n2[neighbor2][edge_label])
+
+    # approximate R_inf with a simple iterative method
+    for i in range(2, n_iteration + 1):
+        R_inf_old = R_inf.copy()
+
+        # calculate R_inf for each pair of nodes; pairs where either node
+        # has no neighbors keep their initialization value.
+        for node1 in g1.nodes():
+            neighbor_n1 = g1[node1]
+            if len(neighbor_n1) > 0:
+                for node2 in g2.nodes():
+                    neighbor_n2 = g2[node2]
+                    if len(neighbor_n2) > 0:
+                        R_inf[(node1, node2)] = r1
+                        for neighbor1 in neighbor_n1:
+                            for neighbor2 in neighbor_n2:
+                                R_inf[(node1, node2)] += \
+                                    (t_dict[(node1, node2, neighbor1, neighbor2)] *
+                                     R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)
+
+    # add up elements of R_inf and calculate the kernel
+    for (n1, n2), value in R_inf.items():
+        s = p_init_G1 * p_init_G2 * deltakernel(
+            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
+        kernel += s * value  # ref [1] equation (6)
+
+    return kernel
+
+
+def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
+    i = itr[0]
+    j = itr[1]
+    return i, j, _marginalizedkernel_do(G_gn[i], G_gn[j], node_label,
+                                        edge_label, p_quit, n_iteration)
+
+
+def wrapper_untotter(Gn, node_label, edge_label, i):
+    return i, untotterTransformation(Gn[i], node_label, edge_label)
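+
+
+if __name__ == '__main__':
+    # Minimal usage sketch: build two toy labeled graphs and compute their
+    # marginalized kernel matrix with the defaults used above. The node
+    # labels 'C'/'O' and bond type '1' are arbitrary illustration values,
+    # not part of the kernel implementation.
+    g1 = nx.Graph()
+    g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}),
+                       (2, {'atom': 'C'})])
+    g1.add_edges_from([(0, 1, {'bond_type': '1'}),
+                       (1, 2, {'bond_type': '1'})])
+    g2 = nx.Graph()
+    g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
+    g2.add_edges_from([(0, 1, {'bond_type': '1'})])
+    Kmatrix, run_time = marginalizedkernel([g1, g2], node_label='atom',
+                                           edge_label='bond_type',
+                                           p_quit=0.5, n_iteration=20,
+                                           remove_totters=False, n_jobs=1)
+    print(Kmatrix)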