|
- """
- @author: linlin
- @references:
- [1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between
- labeled graphs. In Proceedings of the 20th International Conference on
- Machine Learning, Washington, DC, United States, 2003.
- [2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and
- Jean-Philippe Vert. Extensions of marginalized graph kernels. In
- Proceedings of the twenty-first international conference on Machine
- learning, page 70. ACM, 2004.
- """
-
- import sys
- import time
- from functools import partial
- from multiprocessing import Pool
- from tqdm import tqdm
- tqdm.monitor_interval = 0
- #import traceback
-
- import networkx as nx
- import numpy as np
-
- from pygraph.utils.kernels import deltakernel
- from pygraph.utils.utils import untotterTransformation
- from pygraph.utils.graphdataset import get_dataset_attributes
- from pygraph.utils.parallel import parallel_gm
- sys.path.insert(0, "../")
-
-
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.
    remove_totters : boolean
        Whether to remove totterings by the method introduced in [2]. The
        default value is False.
    n_jobs : int
        Number of jobs for parallelization. None means use all CPUs.
    verbose : boolean
        Whether to print progress and timing information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    run_time : float
        Wall-clock time in seconds spent building the matrix.
    """
    import os

    # pre-process
    n_iteration = int(n_iteration)
    # Work on copies so the label normalization below never mutates the
    # caller's graphs. A single copy per graph suffices (the original code
    # copied the single-list argument twice).
    Gn = [g.copy()
          for g in (args[0] if len(args) == 1 else [args[0], args[1]])]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    # Fall back to a constant dummy label when the dataset is unlabeled, so
    # that the delta kernel always has something to compare.
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # Resolve n_jobs explicitly: Pool(None) would work, but the chunksize
        # arithmetic below needs an actual integer (None crashed before).
        if n_jobs is None:
            n_jobs = os.cpu_count() or 1
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # Context manager guarantees the pool is shut down even if a worker
        # raises (the original leaked the pool on error).
        with Pool(n_jobs) as pool:
            for i, g in tqdm(
                    pool.imap_unordered(
                        untotter_partial, range(0, len(Gn)), chunksize),
                    desc='removing tottering',
                    file=sys.stdout):
                Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Share the graph list with worker processes through a module-level
        # global, so it is not re-pickled for every pair of graphs.
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
-
-
def _marginalizedkernel_do(g1, g2, node_label, edge_label, p_quit, n_iteration):
    """Calculate marginalized graph kernel between 2 graphs.

    Parameters
    ----------
    g1, g2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.

    Return
    ------
    kernel : float
        Marginalized kernel between 2 graphs.
    """
    # init parameters
    kernel = 0
    num_nodes_G1 = nx.number_of_nodes(g1)
    num_nodes_G2 = nx.number_of_nodes(g2)
    # the initial probability distribution in the random walks generating
    # step (uniform distribution over |G|)
    p_init_G1 = 1 / num_nodes_G1
    p_init_G2 = 1 / num_nodes_G2

    q = p_quit * p_quit
    r1 = q

    R_inf = {}  # dict to save all the R_inf for all pairs of nodes
    # Initial R_inf, the 1st iteration: a walk pair stops at a node pair with
    # probability p_quit per endpoint that still has neighbors, and with
    # probability 1 at an endpoint that is a dead end.
    for node1 in g1.nodes():
        for node2 in g2.nodes():
            if len(g1[node1]) > 0:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = r1
                else:
                    R_inf[(node1, node2)] = p_quit
            else:
                if len(g2[node2]) > 0:
                    R_inf[(node1, node2)] = p_quit
                else:
                    R_inf[(node1, node2)] = 1

    # Precompute all transition probabilities first; they are invariant over
    # the R_inf iterations below.
    t_dict = {}
    if n_iteration > 1:
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walks
            # generating step (uniform distribution over the vertices
            # adjacent to the current vertex)
            if len(neighbor_n1) > 0:
                p_trans_n1 = (1 - p_quit) / len(neighbor_n1)
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        p_trans_n2 = (1 - p_quit) / len(neighbor_n2)
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                # Use g.nodes[...]: Graph.node was removed in
                                # NetworkX 2.4, and this matches the accessor
                                # already used in the final summation below.
                                t_dict[(node1, node2, neighbor1, neighbor2)] = \
                                    p_trans_n1 * p_trans_n2 * \
                                    deltakernel(g1.nodes[neighbor1][node_label],
                                                g2.nodes[neighbor2][node_label]) * \
                                    deltakernel(
                                        neighbor_n1[neighbor1][edge_label],
                                        neighbor_n2[neighbor2][edge_label])

    # calculate R_inf with a simple iterative method
    # (empty range when n_iteration <= 1, so t_dict is only read when built)
    for i in range(2, n_iteration + 1):
        R_inf_old = R_inf.copy()

        # calculate R_inf for each pair of nodes
        for node1 in g1.nodes():
            neighbor_n1 = g1[node1]
            # the transition probability distribution in the random walks
            # generating step (uniform distribution over the vertices
            # adjacent to the current vertex)
            if len(neighbor_n1) > 0:
                for node2 in g2.nodes():
                    neighbor_n2 = g2[node2]
                    if len(neighbor_n2) > 0:
                        R_inf[(node1, node2)] = r1
                        for neighbor1 in neighbor_n1:
                            for neighbor2 in neighbor_n2:
                                R_inf[(node1, node2)] += \
                                    (t_dict[(node1, node2, neighbor1, neighbor2)] *
                                     R_inf_old[(neighbor1, neighbor2)])  # ref [1] equation (8)

    # add elements of R_inf up and calculate kernel
    for (n1, n2), value in R_inf.items():
        s = p_init_G1 * p_init_G2 * deltakernel(
            g1.nodes[n1][node_label], g2.nodes[n2][node_label])
        kernel += s * value  # ref [1] equation (6)

    return kernel
-
-
def wrapper_marg_do(node_label, edge_label, p_quit, n_iteration, itr):
    """Compute the kernel for one (i, j) pair; helper for the process pool.

    Reads the shared graph list from the worker-global ``G_gn`` set up by
    ``init_worker`` and returns ``(i, j, kernel_value)``.
    """
    i, j = itr
    kernel = _marginalizedkernel_do(G_gn[i], G_gn[j], node_label, edge_label,
                                    p_quit, n_iteration)
    return i, j, kernel
-
-
def wrapper_untotter(Gn, node_label, edge_label, i):
    """Remove tottering from graph ``i``; helper for the process pool.

    Returns ``(i, transformed_graph)`` so the caller can write the result
    back to the right slot even with unordered pool results.
    """
    transformed = untotterTransformation(Gn[i], node_label, edge_label)
    return i, transformed
|