You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

randomWalkKernel.py 41 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907
  1. """
  2. @author: linlin
  3. @references:
  4. [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
  5. """
  6. import time
  7. from functools import partial
  8. from tqdm import tqdm
  9. import networkx as nx
  10. import numpy as np
  11. from scipy.sparse import identity, kron
  12. from scipy.sparse.linalg import cg
  13. from scipy.optimize import fixed_point
  14. from gklearn.utils.graphdataset import get_dataset_attributes
  15. from gklearn.utils.parallel import parallel_gm
  16. def randomwalkkernel(*args,
  17. # params for all method.
  18. compute_method=None,
  19. weight=1,
  20. p=None,
  21. q=None,
  22. edge_weight=None,
  23. # params for conjugate and fp method.
  24. node_kernels=None,
  25. edge_kernels=None,
  26. node_label='atom',
  27. edge_label='bond_type',
  28. # params for spectral method.
  29. sub_kernel=None,
  30. n_jobs=None,
  31. verbose=True):
  32. """Calculate random walk graph kernels.
  33. Parameters
  34. ----------
  35. Gn : List of NetworkX graph
  36. List of graphs between which the kernels are calculated.
  37. G1, G2 : NetworkX graphs
  38. Two graphs between which the kernel is calculated.
  39. compute_method : string
  40. Method used to compute kernel. The Following choices are
  41. available:
  42. 'sylvester' - Sylvester equation method.
  43. 'conjugate' - conjugate gradient method.
  44. 'fp' - fixed-point iterations.
  45. 'spectral' - spectral decomposition.
  46. weight : float
  47. A constant weight set for random walks of length h.
  48. p : None
  49. Initial probability distribution on the unlabeled direct product graph
  50. of two graphs. It is set to be uniform over all vertices in the direct
  51. product graph.
  52. q : None
  53. Stopping probability distribution on the unlabeled direct product graph
  54. of two graphs. It is set to be uniform over all vertices in the direct
  55. product graph.
  56. edge_weight : float
  57. Edge attribute name corresponding to the edge weight.
  58. node_kernels: dict
  59. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  60. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  61. for both labels. The first 2 functions take two node labels as
  62. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  63. non-symbolic label for each the two nodes. Each label is in form of 2-D
  64. dimension array (n_samples, n_features). Each function returns a number
  65. as the kernel value. Ignored when nodes are unlabeled. This argument
  66. is designated to conjugate gradient method and fixed-point iterations.
  67. edge_kernels: dict
  68. A dictionary of kernel functions for edges, including 3 items: 'symb'
  69. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  70. for both labels. The first 2 functions take two edge labels as
  71. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  72. non-symbolic label for each the two edges. Each label is in form of 2-D
  73. dimension array (n_samples, n_features). Each function returns a number
  74. as the kernel value. Ignored when edges are unlabeled. This argument
  75. is designated to conjugate gradient method and fixed-point iterations.
  76. node_label: string
  77. Node attribute used as label. The default node label is atom. This
  78. argument is designated to conjugate gradient method and fixed-point
  79. iterations.
  80. edge_label : string
  81. Edge attribute used as label. The default edge label is bond_type. This
  82. argument is designated to conjugate gradient method and fixed-point
  83. iterations.
  84. sub_kernel: string
  85. Method used to compute walk kernel. The Following choices are
  86. available:
  87. 'exp' : method based on exponential serials.
  88. 'geo' : method based on geometric serials.
  89. n_jobs: int
  90. Number of jobs for parallelization.
  91. Return
  92. ------
  93. Kmatrix : Numpy matrix
  94. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  95. """
  96. compute_method = compute_method.lower()
  97. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  98. Gn = [g.copy() for g in Gn]
  99. eweight = None
  100. if edge_weight == None:
  101. if verbose:
  102. print('\n None edge weight specified. Set all weight to 1.\n')
  103. else:
  104. try:
  105. some_weight = list(
  106. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  107. if isinstance(some_weight, float) or isinstance(some_weight, int):
  108. eweight = edge_weight
  109. else:
  110. if verbose:
  111. print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  112. % edge_weight)
  113. except:
  114. if verbose:
  115. print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  116. % edge_weight)
  117. ds_attrs = get_dataset_attributes(
  118. Gn,
  119. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  120. 'edge_attr_dim', 'is_directed'],
  121. node_label=node_label,
  122. edge_label=edge_label)
  123. # remove graphs with no edges, as no walk can be found in their structures,
  124. # so the weight matrix between such a graph and itself might be zero.
  125. len_gn = len(Gn)
  126. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  127. idx = [G[0] for G in Gn]
  128. Gn = [G[1] for G in Gn]
  129. if len(Gn) != len_gn:
  130. if verbose:
  131. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  132. (len_gn - len(Gn)))
  133. start_time = time.time()
  134. # # get vertex and edge concatenated labels for each graph
  135. # label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
  136. # gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
  137. if compute_method == 'sylvester':
  138. if verbose:
  139. import warnings
  140. warnings.warn('All labels are ignored.')
  141. Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, verbose=verbose)
  142. elif compute_method == 'conjugate':
  143. Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels,
  144. edge_kernels, node_label, edge_label,
  145. eweight, n_jobs, verbose=verbose)
  146. elif compute_method == 'fp':
  147. Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
  148. edge_kernels, node_label, edge_label,
  149. eweight, n_jobs, verbose=verbose)
  150. elif compute_method == 'spectral':
  151. if verbose:
  152. import warnings
  153. warnings.warn('All labels are ignored. Only works for undirected graphs.')
  154. Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
  155. eweight, n_jobs, verbose=verbose)
  156. elif compute_method == 'kron':
  157. pass
  158. for i in range(0, len(Gn)):
  159. for j in range(i, len(Gn)):
  160. Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
  161. node_label, edge_label)
  162. Kmatrix[j][i] = Kmatrix[i][j]
  163. else:
  164. raise Exception(
  165. 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
  166. )
  167. run_time = time.time() - start_time
  168. if verbose:
  169. print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
  170. % (len(Gn), run_time))
  171. return Kmatrix, run_time, idx
  172. ###############################################################################
  173. def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
  174. """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.
  175. Parameters
  176. ----------
  177. G1, G2 : NetworkX graph
  178. Graphs between which the kernel is calculated.
  179. node_label : string
  180. node attribute used as label.
  181. edge_label : string
  182. edge attribute used as label.
  183. Return
  184. ------
  185. kernel : float
  186. Kernel between 2 graphs.
  187. """
  188. Kmatrix = np.zeros((len(Gn), len(Gn)))
  189. if q == None:
  190. # don't normalize adjacency matrices if q is a uniform vector. Note
  191. # A_wave_list actually contains the transposes of the adjacency matrices.
  192. A_wave_list = [
  193. nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  194. (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if
  195. verbose else Gn)
  196. ]
  197. # # normalized adjacency matrices
  198. # A_wave_list = []
  199. # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
  200. # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
  201. # norm = A_tilde.sum(axis=0)
  202. # norm[norm == 0] = 1
  203. # A_wave_list.append(A_tilde / norm)
  204. if p == None: # p is uniform distribution as default.
  205. def init_worker(Awl_toshare):
  206. global G_Awl
  207. G_Awl = Awl_toshare
  208. do_partial = partial(wrapper_se_do, lmda)
  209. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  210. glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose)
  211. # pbar = tqdm(
  212. # total=(1 + len(Gn)) * len(Gn) / 2,
  213. # desc='calculating kernels',
  214. # file=sys.stdout)
  215. # for i in range(0, len(Gn)):
  216. # for j in range(i, len(Gn)):
  217. # S = lmda * A_wave_list[j]
  218. # T_t = A_wave_list[i]
  219. # # use uniform distribution if there is no prior knowledge.
  220. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  221. # p_times_uni = 1 / nb_pd
  222. # M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
  223. # X = dlyap(S, T_t, M0)
  224. # X = np.reshape(X, (-1, 1), order='F')
  225. # # use uniform distribution if there is no prior knowledge.
  226. # q_times = np.full((1, nb_pd), p_times_uni)
  227. # Kmatrix[i][j] = np.dot(q_times, X)
  228. # Kmatrix[j][i] = Kmatrix[i][j]
  229. # pbar.update(1)
  230. return Kmatrix
  231. def wrapper_se_do(lmda, itr):
  232. i = itr[0]
  233. j = itr[1]
  234. return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
  235. def _se_do(A_wave1, A_wave2, lmda):
  236. from control import dlyap
  237. S = lmda * A_wave2
  238. T_t = A_wave1
  239. # use uniform distribution if there is no prior knowledge.
  240. nb_pd = len(A_wave1) * len(A_wave2)
  241. p_times_uni = 1 / nb_pd
  242. M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
  243. X = dlyap(S, T_t, M0)
  244. X = np.reshape(X, (-1, 1), order='F')
  245. # use uniform distribution if there is no prior knowledge.
  246. q_times = np.full((1, nb_pd), p_times_uni)
  247. return np.dot(q_times, X)
  248. ###############################################################################
  249. def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  250. node_label, edge_label, eweight, n_jobs, verbose=True):
  251. """Calculate walk graph kernels up to n between 2 graphs using conjugate method.
  252. Parameters
  253. ----------
  254. G1, G2 : NetworkX graph
  255. Graphs between which the kernel is calculated.
  256. node_label : string
  257. node attribute used as label.
  258. edge_label : string
  259. edge attribute used as label.
  260. Return
  261. ------
  262. kernel : float
  263. Kernel between 2 graphs.
  264. """
  265. Kmatrix = np.zeros((len(Gn), len(Gn)))
  266. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  267. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
  268. # # this is faster from unlabeled graphs. @todo: why?
  269. # if q == None:
  270. # # don't normalize adjacency matrices if q is a uniform vector. Note
  271. # # A_wave_list actually contains the transposes of the adjacency matrices.
  272. # A_wave_list = [
  273. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  274. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  275. # ]
  276. # if p == None: # p is uniform distribution as default.
  277. # def init_worker(Awl_toshare):
  278. # global G_Awl
  279. # G_Awl = Awl_toshare
  280. # do_partial = partial(wrapper_cg_unlabled_do, lmda)
  281. # parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  282. # glbv=(A_wave_list,), n_jobs=n_jobs)
  283. # else:
  284. # reindex nodes using consecutive integers for convenience of kernel calculation.
  285. Gn = [nx.convert_node_labels_to_integers(
  286. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  287. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  288. if p == None and q == None: # p and q are uniform distributions as default.
  289. def init_worker(gn_toshare):
  290. global G_gn
  291. G_gn = gn_toshare
  292. do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
  293. node_label, edge_kernels, edge_label, lmda)
  294. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  295. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  296. # pbar = tqdm(
  297. # total=(1 + len(Gn)) * len(Gn) / 2,
  298. # desc='calculating kernels',
  299. # file=sys.stdout)
  300. # for i in range(0, len(Gn)):
  301. # for j in range(i, len(Gn)):
  302. # result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
  303. # node_label, edge_kernels, edge_label, lmda)
  304. # Kmatrix[i][j] = result
  305. # Kmatrix[j][i] = Kmatrix[i][j]
  306. # pbar.update(1)
  307. return Kmatrix
  308. def wrapper_cg_unlabled_do(lmda, itr):
  309. i = itr[0]
  310. j = itr[1]
  311. return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
  312. def _cg_unlabled_do(A_wave1, A_wave2, lmda):
  313. nb_pd = len(A_wave1) * len(A_wave2)
  314. p_times_uni = 1 / nb_pd
  315. w_times = kron(A_wave1, A_wave2).todense()
  316. A = identity(w_times.shape[0]) - w_times * lmda
  317. b = np.full((nb_pd, 1), p_times_uni)
  318. x, _ = cg(A, b)
  319. # use uniform distribution if there is no prior knowledge.
  320. q_times = np.full((1, nb_pd), p_times_uni)
  321. return np.dot(q_times, x)
  322. def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  323. edge_label, lmda, itr):
  324. i = itr[0]
  325. j = itr[1]
  326. return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  327. node_label, edge_kernels, edge_label, lmda)
  328. def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  329. edge_kernels, edge_label, lmda):
  330. # Frist, compute kernels between all pairs of nodes, method borrowed
  331. # from FCSP. It is faster than directly computing all edge kernels
  332. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  333. # graphs compared, which is the most case we went though. For very
  334. # sparse graphs, this would be slow.
  335. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  336. # Compute weight matrix of the direct product graph.
  337. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  338. edge_kernels, edge_label)
  339. # use uniform distribution if there is no prior knowledge.
  340. p_times_uni = 1 / w_dim
  341. A = identity(w_times.shape[0]) - w_times * lmda
  342. b = np.full((w_dim, 1), p_times_uni)
  343. x, _ = cg(A, b)
  344. # use uniform distribution if there is no prior knowledge.
  345. q_times = np.full((1, w_dim), p_times_uni)
  346. return np.dot(q_times, x)
  347. ###############################################################################
  348. def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  349. node_label, edge_label, eweight, n_jobs, verbose=True):
  350. """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.
  351. Parameters
  352. ----------
  353. G1, G2 : NetworkX graph
  354. Graphs between which the kernel is calculated.
  355. node_label : string
  356. node attribute used as label.
  357. edge_label : string
  358. edge attribute used as label.
  359. Return
  360. ------
  361. kernel : float
  362. Kernel between 2 graphs.
  363. """
  364. Kmatrix = np.zeros((len(Gn), len(Gn)))
  365. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  366. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
  367. # # this is faster from unlabeled graphs. @todo: why?
  368. # if q == None:
  369. # # don't normalize adjacency matrices if q is a uniform vector. Note
  370. # # A_wave_list actually contains the transposes of the adjacency matrices.
  371. # A_wave_list = [
  372. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  373. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  374. # ]
  375. # if p == None: # p is uniform distribution as default.
  376. # pbar = tqdm(
  377. # total=(1 + len(Gn)) * len(Gn) / 2,
  378. # desc='calculating kernels',
  379. # file=sys.stdout)
  380. # for i in range(0, len(Gn)):
  381. # for j in range(i, len(Gn)):
  382. # # use uniform distribution if there is no prior knowledge.
  383. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  384. # p_times_uni = 1 / nb_pd
  385. # w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
  386. # p_times = np.full((nb_pd, 1), p_times_uni)
  387. # x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
  388. # # use uniform distribution if there is no prior knowledge.
  389. # q_times = np.full((1, nb_pd), p_times_uni)
  390. # Kmatrix[i][j] = np.dot(q_times, x)
  391. # Kmatrix[j][i] = Kmatrix[i][j]
  392. # pbar.update(1)
  393. # else:
  394. # reindex nodes using consecutive integers for convenience of kernel calculation.
  395. Gn = [nx.convert_node_labels_to_integers(
  396. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  397. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  398. if p == None and q == None: # p and q are uniform distributions as default.
  399. def init_worker(gn_toshare):
  400. global G_gn
  401. G_gn = gn_toshare
  402. do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
  403. node_label, edge_kernels, edge_label, lmda)
  404. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  405. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  406. return Kmatrix
  407. def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  408. edge_label, lmda, itr):
  409. i = itr[0]
  410. j = itr[1]
  411. return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  412. node_label, edge_kernels, edge_label, lmda)
  413. def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  414. edge_kernels, edge_label, lmda):
  415. # Frist, compute kernels between all pairs of nodes, method borrowed
  416. # from FCSP. It is faster than directly computing all edge kernels
  417. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  418. # graphs compared, which is the most case we went though. For very
  419. # sparse graphs, this would be slow.
  420. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  421. # Compute weight matrix of the direct product graph.
  422. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  423. edge_kernels, edge_label)
  424. # use uniform distribution if there is no prior knowledge.
  425. p_times_uni = 1 / w_dim
  426. p_times = np.full((w_dim, 1), p_times_uni)
  427. x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
  428. xtol=1e-06, maxiter=1000)
  429. # use uniform distribution if there is no prior knowledge.
  430. q_times = np.full((1, w_dim), p_times_uni)
  431. return np.dot(q_times, x)
  432. def func_fp(x, p_times, lmda, w_times):
  433. haha = w_times * x
  434. haha = lmda * haha
  435. haha = p_times + haha
  436. return p_times + lmda * np.dot(w_times, x)
  437. ###############################################################################
  438. def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):
  439. """Calculate walk graph kernels up to n between 2 unlabeled graphs using
  440. spectral decomposition method. Labels will be ignored.
  441. Parameters
  442. ----------
  443. G1, G2 : NetworkX graph
  444. Graphs between which the kernel is calculated.
  445. node_label : string
  446. node attribute used as label.
  447. edge_label : string
  448. edge attribute used as label.
  449. Return
  450. ------
  451. kernel : float
  452. Kernel between 2 graphs.
  453. """
  454. Kmatrix = np.zeros((len(Gn), len(Gn)))
  455. if q == None:
  456. # precompute the spectral decomposition of each graph.
  457. P_list = []
  458. D_list = []
  459. for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if
  460. verbose else Gn):
  461. # don't normalize adjacency matrices if q is a uniform vector. Note
  462. # A actually is the transpose of the adjacency matrix.
  463. A = nx.adjacency_matrix(G, eweight).todense().transpose()
  464. ew, ev = np.linalg.eig(A)
  465. D_list.append(ew)
  466. P_list.append(ev)
  467. # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?
  468. if p == None: # p is uniform distribution as default.
  469. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
  470. # q_T_list = [q.T for q in q_list]
  471. def init_worker(q_T_toshare, P_toshare, D_toshare):
  472. global G_q_T, G_P, G_D
  473. G_q_T = q_T_toshare
  474. G_P = P_toshare
  475. G_D = D_toshare
  476. do_partial = partial(wrapper_sd_do, weight, sub_kernel)
  477. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  478. glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
  479. verbose=verbose)
  480. # pbar = tqdm(
  481. # total=(1 + len(Gn)) * len(Gn) / 2,
  482. # desc='calculating kernels',
  483. # file=sys.stdout)
  484. # for i in range(0, len(Gn)):
  485. # for j in range(i, len(Gn)):
  486. # result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
  487. # D_list[i], D_list[j], weight, sub_kernel)
  488. # Kmatrix[i][j] = result
  489. # Kmatrix[j][i] = Kmatrix[i][j]
  490. # pbar.update(1)
  491. return Kmatrix
  492. def wrapper_sd_do(weight, sub_kernel, itr):
  493. i = itr[0]
  494. j = itr[1]
  495. return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
  496. weight, sub_kernel)
  497. def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
  498. # use uniform distribution if there is no prior knowledge.
  499. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
  500. # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs
  501. # kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
  502. if sub_kernel == 'exp':
  503. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  504. kmiddle = np.diag(np.exp(weight * D_diag))
  505. elif sub_kernel == 'geo':
  506. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  507. kmiddle = np.diag(weight * D_diag)
  508. kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
  509. kmiddle = np.linalg.inv(kmiddle)
  510. return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
  511. ###############################################################################
  512. def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
  513. """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.
  514. Parameters
  515. ----------
  516. G1, G2 : NetworkX graph
  517. Graphs between which the kernel is calculated.
  518. node_label : string
  519. node attribute used as label.
  520. edge_label : string
  521. edge attribute used as label.
  522. Return
  523. ------
  524. kernel : float
  525. Kernel between 2 graphs.
  526. """
  527. pass
  528. ###############################################################################
  529. def getLabels(Gn, node_label, edge_label, directed):
  530. """Get symbolic labels of a graph dataset, where vertex labels are dealt
  531. with by concatenating them to the edge labels of adjacent edges.
  532. """
  533. label_list = []
  534. label_set = set()
  535. for g in Gn:
  536. label_g = {}
  537. for e in g.edges(data=True):
  538. nl1 = g.node[e[0]][node_label]
  539. nl2 = g.node[e[1]][node_label]
  540. if not directed and nl1 > nl2:
  541. nl1, nl2 = nl2, nl1
  542. label = (nl1, e[2][edge_label], nl2)
  543. label_g[(e[0], e[1])] = label
  544. label_list.append(label_g)
  545. label_set = set([l for lg in label_list for l in lg.values()])
  546. return label_list, len(label_set)
  547. def filterGramMatrix(gmt, label_dict, label, directed):
  548. """Compute (the transpose of) the Gram matrix filtered by a label.
  549. """
  550. gmf = np.zeros(gmt.shape)
  551. for (n1, n2), l in label_dict.items():
  552. if l == label:
  553. gmf[n2, n1] = gmt[n2, n1]
  554. if not directed:
  555. gmf[n1, n2] = gmt[n1, n2]
  556. return gmf
  557. def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
  558. '''Compute vertex kernels between vertices of two graphs.
  559. '''
  560. vk_dict = {} # shortest path matrices dict
  561. if ds_attrs['node_labeled']:
  562. # node symb and non-synb labeled
  563. if ds_attrs['node_attr_dim'] > 0:
  564. kn = node_kernels['mix']
  565. for n1 in g1.nodes(data=True):
  566. for n2 in g2.nodes(data=True):
  567. vk_dict[(n1[0], n2[0])] = kn(
  568. n1[1][node_label], n2[1][node_label],
  569. n1[1]['attributes'], n2[1]['attributes'])
  570. # node symb labeled
  571. else:
  572. kn = node_kernels['symb']
  573. for n1 in g1.nodes(data=True):
  574. for n2 in g2.nodes(data=True):
  575. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  576. n2[1][node_label])
  577. else:
  578. # node non-synb labeled
  579. if ds_attrs['node_attr_dim'] > 0:
  580. kn = node_kernels['nsymb']
  581. for n1 in g1.nodes(data=True):
  582. for n2 in g2.nodes(data=True):
  583. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  584. n2[1]['attributes'])
  585. # node unlabeled
  586. else:
  587. pass
  588. return vk_dict
  589. def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
  590. '''Compute weight matrix of the direct product graph.
  591. '''
  592. w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
  593. w_times = np.zeros((w_dim, w_dim))
  594. if vk_dict: # node labeled
  595. if ds_attrs['is_directed']:
  596. if ds_attrs['edge_labeled']:
  597. # edge symb and non-synb labeled
  598. if ds_attrs['edge_attr_dim'] > 0:
  599. ke = edge_kernels['mix']
  600. for e1 in g1.edges(data=True):
  601. for e2 in g2.edges(data=True):
  602. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  603. e1[2]['attributes'], e2[2]['attributes'])
  604. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  605. e1[1] * nx.number_of_nodes(g2) + e2[1])
  606. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  607. * ek_temp * vk_dict[(e1[1], e2[1])]
  608. # edge symb labeled
  609. else:
  610. ke = edge_kernels['symb']
  611. for e1 in g1.edges(data=True):
  612. for e2 in g2.edges(data=True):
  613. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  614. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  615. e1[1] * nx.number_of_nodes(g2) + e2[1])
  616. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  617. * ek_temp * vk_dict[(e1[1], e2[1])]
  618. else:
  619. # edge non-synb labeled
  620. if ds_attrs['edge_attr_dim'] > 0:
  621. ke = edge_kernels['nsymb']
  622. for e1 in g1.edges(data=True):
  623. for e2 in g2.edges(data=True):
  624. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  625. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  626. e1[1] * nx.number_of_nodes(g2) + e2[1])
  627. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  628. * ek_temp * vk_dict[(e1[1], e2[1])]
  629. # edge unlabeled
  630. else:
  631. for e1 in g1.edges(data=True):
  632. for e2 in g2.edges(data=True):
  633. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  634. e1[1] * nx.number_of_nodes(g2) + e2[1])
  635. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  636. * vk_dict[(e1[1], e2[1])]
  637. else: # undirected
  638. if ds_attrs['edge_labeled']:
  639. # edge symb and non-synb labeled
  640. if ds_attrs['edge_attr_dim'] > 0:
  641. ke = edge_kernels['mix']
  642. for e1 in g1.edges(data=True):
  643. for e2 in g2.edges(data=True):
  644. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  645. e1[2]['attributes'], e2[2]['attributes'])
  646. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  647. e1[1] * nx.number_of_nodes(g2) + e2[1])
  648. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  649. * ek_temp * vk_dict[(e1[1], e2[1])] \
  650. + vk_dict[(e1[0], e2[1])] \
  651. * ek_temp * vk_dict[(e1[1], e2[0])]
  652. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  653. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  654. e1[1] * nx.number_of_nodes(g2) + e2[0])
  655. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  656. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  657. # edge symb labeled
  658. else:
  659. ke = edge_kernels['symb']
  660. for e1 in g1.edges(data=True):
  661. for e2 in g2.edges(data=True):
  662. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  663. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  664. e1[1] * nx.number_of_nodes(g2) + e2[1])
  665. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  666. * ek_temp * vk_dict[(e1[1], e2[1])] \
  667. + vk_dict[(e1[0], e2[1])] \
  668. * ek_temp * vk_dict[(e1[1], e2[0])]
  669. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  670. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  671. e1[1] * nx.number_of_nodes(g2) + e2[0])
  672. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  673. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  674. else:
  675. # edge non-synb labeled
  676. if ds_attrs['edge_attr_dim'] > 0:
  677. ke = edge_kernels['nsymb']
  678. for e1 in g1.edges(data=True):
  679. for e2 in g2.edges(data=True):
  680. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  681. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  682. e1[1] * nx.number_of_nodes(g2) + e2[1])
  683. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  684. * ek_temp * vk_dict[(e1[1], e2[1])] \
  685. + vk_dict[(e1[0], e2[1])] \
  686. * ek_temp * vk_dict[(e1[1], e2[0])]
  687. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  688. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  689. e1[1] * nx.number_of_nodes(g2) + e2[0])
  690. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  691. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  692. # edge unlabeled
  693. else:
  694. for e1 in g1.edges(data=True):
  695. for e2 in g2.edges(data=True):
  696. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  697. e1[1] * nx.number_of_nodes(g2) + e2[1])
  698. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  699. * vk_dict[(e1[1], e2[1])] \
  700. + vk_dict[(e1[0], e2[1])] \
  701. * vk_dict[(e1[1], e2[0])]
  702. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  703. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  704. e1[1] * nx.number_of_nodes(g2) + e2[0])
  705. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  706. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  707. else: # node unlabeled
  708. if ds_attrs['is_directed']:
  709. if ds_attrs['edge_labeled']:
  710. # edge symb and non-synb labeled
  711. if ds_attrs['edge_attr_dim'] > 0:
  712. ke = edge_kernels['mix']
  713. for e1 in g1.edges(data=True):
  714. for e2 in g2.edges(data=True):
  715. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  716. e1[2]['attributes'], e2[2]['attributes'])
  717. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  718. e1[1] * nx.number_of_nodes(g2) + e2[1])
  719. w_times[w_idx] = ek_temp
  720. # edge symb labeled
  721. else:
  722. ke = edge_kernels['symb']
  723. for e1 in g1.edges(data=True):
  724. for e2 in g2.edges(data=True):
  725. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  726. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  727. e1[1] * nx.number_of_nodes(g2) + e2[1])
  728. w_times[w_idx] = ek_temp
  729. else:
  730. # edge non-synb labeled
  731. if ds_attrs['edge_attr_dim'] > 0:
  732. ke = edge_kernels['nsymb']
  733. for e1 in g1.edges(data=True):
  734. for e2 in g2.edges(data=True):
  735. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  736. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  737. e1[1] * nx.number_of_nodes(g2) + e2[1])
  738. w_times[w_idx] = ek_temp
  739. # edge unlabeled
  740. else:
  741. for e1 in g1.edges(data=True):
  742. for e2 in g2.edges(data=True):
  743. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  744. e1[1] * nx.number_of_nodes(g2) + e2[1])
  745. w_times[w_idx] = 1
  746. else: # undirected
  747. if ds_attrs['edge_labeled']:
  748. # edge symb and non-synb labeled
  749. if ds_attrs['edge_attr_dim'] > 0:
  750. ke = edge_kernels['mix']
  751. for e1 in g1.edges(data=True):
  752. for e2 in g2.edges(data=True):
  753. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  754. e1[2]['attributes'], e2[2]['attributes'])
  755. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  756. e1[1] * nx.number_of_nodes(g2) + e2[1])
  757. w_times[w_idx] = ek_temp
  758. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  759. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  760. e1[1] * nx.number_of_nodes(g2) + e2[0])
  761. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  762. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  763. # edge symb labeled
  764. else:
  765. ke = edge_kernels['symb']
  766. for e1 in g1.edges(data=True):
  767. for e2 in g2.edges(data=True):
  768. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  769. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  770. e1[1] * nx.number_of_nodes(g2) + e2[1])
  771. w_times[w_idx] = ek_temp
  772. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  773. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  774. e1[1] * nx.number_of_nodes(g2) + e2[0])
  775. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  776. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  777. else:
  778. # edge non-synb labeled
  779. if ds_attrs['edge_attr_dim'] > 0:
  780. ke = edge_kernels['nsymb']
  781. for e1 in g1.edges(data=True):
  782. for e2 in g2.edges(data=True):
  783. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  784. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  785. e1[1] * nx.number_of_nodes(g2) + e2[1])
  786. w_times[w_idx] = ek_temp
  787. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  788. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  789. e1[1] * nx.number_of_nodes(g2) + e2[0])
  790. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  791. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  792. # edge unlabeled
  793. else:
  794. for e1 in g1.edges(data=True):
  795. for e2 in g2.edges(data=True):
  796. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  797. e1[1] * nx.number_of_nodes(g2) + e2[1])
  798. w_times[w_idx] = 1
  799. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  800. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  801. e1[1] * nx.number_of_nodes(g2) + e2[0])
  802. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  803. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  804. return w_times, w_dim

A Python package for graph kernels, graph edit distances and graph pre-image problem.