You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

randomWalkKernel.py 41 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908
  1. """
  2. @author: linlin
  3. @references:
  4. [1] S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
  5. """
  6. import time
  7. from functools import partial
  8. from tqdm import tqdm
  9. import sys
  10. import networkx as nx
  11. import numpy as np
  12. from scipy.sparse import identity, kron
  13. from scipy.sparse.linalg import cg
  14. from scipy.optimize import fixed_point
  15. from gklearn.utils.graphdataset import get_dataset_attributes
  16. from gklearn.utils.parallel import parallel_gm
  17. def randomwalkkernel(*args,
  18. # params for all method.
  19. compute_method=None,
  20. weight=1,
  21. p=None,
  22. q=None,
  23. edge_weight=None,
  24. # params for conjugate and fp method.
  25. node_kernels=None,
  26. edge_kernels=None,
  27. node_label='atom',
  28. edge_label='bond_type',
  29. # params for spectral method.
  30. sub_kernel=None,
  31. n_jobs=None,
  32. verbose=True):
  33. """Calculate random walk graph kernels.
  34. Parameters
  35. ----------
  36. Gn : List of NetworkX graph
  37. List of graphs between which the kernels are calculated.
  38. G1, G2 : NetworkX graphs
  39. Two graphs between which the kernel is calculated.
  40. compute_method : string
  41. Method used to compute kernel. The Following choices are
  42. available:
  43. 'sylvester' - Sylvester equation method.
  44. 'conjugate' - conjugate gradient method.
  45. 'fp' - fixed-point iterations.
  46. 'spectral' - spectral decomposition.
  47. weight : float
  48. A constant weight set for random walks of length h.
  49. p : None
  50. Initial probability distribution on the unlabeled direct product graph
  51. of two graphs. It is set to be uniform over all vertices in the direct
  52. product graph.
  53. q : None
  54. Stopping probability distribution on the unlabeled direct product graph
  55. of two graphs. It is set to be uniform over all vertices in the direct
  56. product graph.
  57. edge_weight : float
  58. Edge attribute name corresponding to the edge weight.
  59. node_kernels: dict
  60. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  61. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  62. for both labels. The first 2 functions take two node labels as
  63. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  64. non-symbolic label for each the two nodes. Each label is in form of 2-D
  65. dimension array (n_samples, n_features). Each function returns a number
  66. as the kernel value. Ignored when nodes are unlabeled. This argument
  67. is designated to conjugate gradient method and fixed-point iterations.
  68. edge_kernels: dict
  69. A dictionary of kernel functions for edges, including 3 items: 'symb'
  70. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  71. for both labels. The first 2 functions take two edge labels as
  72. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  73. non-symbolic label for each the two edges. Each label is in form of 2-D
  74. dimension array (n_samples, n_features). Each function returns a number
  75. as the kernel value. Ignored when edges are unlabeled. This argument
  76. is designated to conjugate gradient method and fixed-point iterations.
  77. node_label: string
  78. Node attribute used as label. The default node label is atom. This
  79. argument is designated to conjugate gradient method and fixed-point
  80. iterations.
  81. edge_label : string
  82. Edge attribute used as label. The default edge label is bond_type. This
  83. argument is designated to conjugate gradient method and fixed-point
  84. iterations.
  85. sub_kernel: string
  86. Method used to compute walk kernel. The Following choices are
  87. available:
  88. 'exp' : method based on exponential serials.
  89. 'geo' : method based on geometric serials.
  90. n_jobs: int
  91. Number of jobs for parallelization.
  92. Return
  93. ------
  94. Kmatrix : Numpy matrix
  95. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  96. """
  97. compute_method = compute_method.lower()
  98. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  99. Gn = [g.copy() for g in Gn]
  100. eweight = None
  101. if edge_weight == None:
  102. if verbose:
  103. print('\n None edge weight specified. Set all weight to 1.\n')
  104. else:
  105. try:
  106. some_weight = list(
  107. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  108. if isinstance(some_weight, float) or isinstance(some_weight, int):
  109. eweight = edge_weight
  110. else:
  111. if verbose:
  112. print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  113. % edge_weight)
  114. except:
  115. if verbose:
  116. print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  117. % edge_weight)
  118. ds_attrs = get_dataset_attributes(
  119. Gn,
  120. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  121. 'edge_attr_dim', 'is_directed'],
  122. node_label=node_label,
  123. edge_label=edge_label)
  124. # remove graphs with no edges, as no walk can be found in their structures,
  125. # so the weight matrix between such a graph and itself might be zero.
  126. len_gn = len(Gn)
  127. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  128. idx = [G[0] for G in Gn]
  129. Gn = [G[1] for G in Gn]
  130. if len(Gn) != len_gn:
  131. if verbose:
  132. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  133. (len_gn - len(Gn)))
  134. start_time = time.time()
  135. # # get vertex and edge concatenated labels for each graph
  136. # label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
  137. # gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
  138. if compute_method == 'sylvester':
  139. if verbose:
  140. import warnings
  141. warnings.warn('All labels are ignored.')
  142. Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, verbose=verbose)
  143. elif compute_method == 'conjugate':
  144. Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels,
  145. edge_kernels, node_label, edge_label,
  146. eweight, n_jobs, verbose=verbose)
  147. elif compute_method == 'fp':
  148. Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
  149. edge_kernels, node_label, edge_label,
  150. eweight, n_jobs, verbose=verbose)
  151. elif compute_method == 'spectral':
  152. if verbose:
  153. import warnings
  154. warnings.warn('All labels are ignored. Only works for undirected graphs.')
  155. Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
  156. eweight, n_jobs, verbose=verbose)
  157. elif compute_method == 'kron':
  158. pass
  159. for i in range(0, len(Gn)):
  160. for j in range(i, len(Gn)):
  161. Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
  162. node_label, edge_label)
  163. Kmatrix[j][i] = Kmatrix[i][j]
  164. else:
  165. raise Exception(
  166. 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
  167. )
  168. run_time = time.time() - start_time
  169. if verbose:
  170. print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
  171. % (len(Gn), run_time))
  172. return Kmatrix, run_time, idx
  173. ###############################################################################
  174. def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
  175. """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.
  176. Parameters
  177. ----------
  178. G1, G2 : NetworkX graph
  179. Graphs between which the kernel is calculated.
  180. node_label : string
  181. node attribute used as label.
  182. edge_label : string
  183. edge attribute used as label.
  184. Return
  185. ------
  186. kernel : float
  187. Kernel between 2 graphs.
  188. """
  189. Kmatrix = np.zeros((len(Gn), len(Gn)))
  190. if q == None:
  191. # don't normalize adjacency matrices if q is a uniform vector. Note
  192. # A_wave_list actually contains the transposes of the adjacency matrices.
  193. A_wave_list = [
  194. nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  195. (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if
  196. verbose else Gn)
  197. ]
  198. # # normalized adjacency matrices
  199. # A_wave_list = []
  200. # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
  201. # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
  202. # norm = A_tilde.sum(axis=0)
  203. # norm[norm == 0] = 1
  204. # A_wave_list.append(A_tilde / norm)
  205. if p == None: # p is uniform distribution as default.
  206. def init_worker(Awl_toshare):
  207. global G_Awl
  208. G_Awl = Awl_toshare
  209. do_partial = partial(wrapper_se_do, lmda)
  210. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  211. glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose)
  212. # pbar = tqdm(
  213. # total=(1 + len(Gn)) * len(Gn) / 2,
  214. # desc='calculating kernels',
  215. # file=sys.stdout)
  216. # for i in range(0, len(Gn)):
  217. # for j in range(i, len(Gn)):
  218. # S = lmda * A_wave_list[j]
  219. # T_t = A_wave_list[i]
  220. # # use uniform distribution if there is no prior knowledge.
  221. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  222. # p_times_uni = 1 / nb_pd
  223. # M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
  224. # X = dlyap(S, T_t, M0)
  225. # X = np.reshape(X, (-1, 1), order='F')
  226. # # use uniform distribution if there is no prior knowledge.
  227. # q_times = np.full((1, nb_pd), p_times_uni)
  228. # Kmatrix[i][j] = np.dot(q_times, X)
  229. # Kmatrix[j][i] = Kmatrix[i][j]
  230. # pbar.update(1)
  231. return Kmatrix
  232. def wrapper_se_do(lmda, itr):
  233. i = itr[0]
  234. j = itr[1]
  235. return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
  236. def _se_do(A_wave1, A_wave2, lmda):
  237. from control import dlyap
  238. S = lmda * A_wave2
  239. T_t = A_wave1
  240. # use uniform distribution if there is no prior knowledge.
  241. nb_pd = len(A_wave1) * len(A_wave2)
  242. p_times_uni = 1 / nb_pd
  243. M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
  244. X = dlyap(S, T_t, M0)
  245. X = np.reshape(X, (-1, 1), order='F')
  246. # use uniform distribution if there is no prior knowledge.
  247. q_times = np.full((1, nb_pd), p_times_uni)
  248. return np.dot(q_times, X)
  249. ###############################################################################
  250. def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  251. node_label, edge_label, eweight, n_jobs, verbose=True):
  252. """Calculate walk graph kernels up to n between 2 graphs using conjugate method.
  253. Parameters
  254. ----------
  255. G1, G2 : NetworkX graph
  256. Graphs between which the kernel is calculated.
  257. node_label : string
  258. node attribute used as label.
  259. edge_label : string
  260. edge attribute used as label.
  261. Return
  262. ------
  263. kernel : float
  264. Kernel between 2 graphs.
  265. """
  266. Kmatrix = np.zeros((len(Gn), len(Gn)))
  267. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  268. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
  269. # # this is faster from unlabeled graphs. @todo: why?
  270. # if q == None:
  271. # # don't normalize adjacency matrices if q is a uniform vector. Note
  272. # # A_wave_list actually contains the transposes of the adjacency matrices.
  273. # A_wave_list = [
  274. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  275. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  276. # ]
  277. # if p == None: # p is uniform distribution as default.
  278. # def init_worker(Awl_toshare):
  279. # global G_Awl
  280. # G_Awl = Awl_toshare
  281. # do_partial = partial(wrapper_cg_unlabled_do, lmda)
  282. # parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  283. # glbv=(A_wave_list,), n_jobs=n_jobs)
  284. # else:
  285. # reindex nodes using consecutive integers for convenience of kernel calculation.
  286. Gn = [nx.convert_node_labels_to_integers(
  287. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  288. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  289. if p == None and q == None: # p and q are uniform distributions as default.
  290. def init_worker(gn_toshare):
  291. global G_gn
  292. G_gn = gn_toshare
  293. do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
  294. node_label, edge_kernels, edge_label, lmda)
  295. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  296. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  297. # pbar = tqdm(
  298. # total=(1 + len(Gn)) * len(Gn) / 2,
  299. # desc='calculating kernels',
  300. # file=sys.stdout)
  301. # for i in range(0, len(Gn)):
  302. # for j in range(i, len(Gn)):
  303. # result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
  304. # node_label, edge_kernels, edge_label, lmda)
  305. # Kmatrix[i][j] = result
  306. # Kmatrix[j][i] = Kmatrix[i][j]
  307. # pbar.update(1)
  308. return Kmatrix
  309. def wrapper_cg_unlabled_do(lmda, itr):
  310. i = itr[0]
  311. j = itr[1]
  312. return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
  313. def _cg_unlabled_do(A_wave1, A_wave2, lmda):
  314. nb_pd = len(A_wave1) * len(A_wave2)
  315. p_times_uni = 1 / nb_pd
  316. w_times = kron(A_wave1, A_wave2).todense()
  317. A = identity(w_times.shape[0]) - w_times * lmda
  318. b = np.full((nb_pd, 1), p_times_uni)
  319. x, _ = cg(A, b)
  320. # use uniform distribution if there is no prior knowledge.
  321. q_times = np.full((1, nb_pd), p_times_uni)
  322. return np.dot(q_times, x)
  323. def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  324. edge_label, lmda, itr):
  325. i = itr[0]
  326. j = itr[1]
  327. return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  328. node_label, edge_kernels, edge_label, lmda)
  329. def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  330. edge_kernels, edge_label, lmda):
  331. # Frist, compute kernels between all pairs of nodes, method borrowed
  332. # from FCSP. It is faster than directly computing all edge kernels
  333. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  334. # graphs compared, which is the most case we went though. For very
  335. # sparse graphs, this would be slow.
  336. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  337. # Compute weight matrix of the direct product graph.
  338. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  339. edge_kernels, edge_label)
  340. # use uniform distribution if there is no prior knowledge.
  341. p_times_uni = 1 / w_dim
  342. A = identity(w_times.shape[0]) - w_times * lmda
  343. b = np.full((w_dim, 1), p_times_uni)
  344. x, _ = cg(A, b)
  345. # use uniform distribution if there is no prior knowledge.
  346. q_times = np.full((1, w_dim), p_times_uni)
  347. return np.dot(q_times, x)
  348. ###############################################################################
  349. def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  350. node_label, edge_label, eweight, n_jobs, verbose=True):
  351. """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.
  352. Parameters
  353. ----------
  354. G1, G2 : NetworkX graph
  355. Graphs between which the kernel is calculated.
  356. node_label : string
  357. node attribute used as label.
  358. edge_label : string
  359. edge attribute used as label.
  360. Return
  361. ------
  362. kernel : float
  363. Kernel between 2 graphs.
  364. """
  365. Kmatrix = np.zeros((len(Gn), len(Gn)))
  366. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  367. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
  368. # # this is faster from unlabeled graphs. @todo: why?
  369. # if q == None:
  370. # # don't normalize adjacency matrices if q is a uniform vector. Note
  371. # # A_wave_list actually contains the transposes of the adjacency matrices.
  372. # A_wave_list = [
  373. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  374. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  375. # ]
  376. # if p == None: # p is uniform distribution as default.
  377. # pbar = tqdm(
  378. # total=(1 + len(Gn)) * len(Gn) / 2,
  379. # desc='calculating kernels',
  380. # file=sys.stdout)
  381. # for i in range(0, len(Gn)):
  382. # for j in range(i, len(Gn)):
  383. # # use uniform distribution if there is no prior knowledge.
  384. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  385. # p_times_uni = 1 / nb_pd
  386. # w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
  387. # p_times = np.full((nb_pd, 1), p_times_uni)
  388. # x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
  389. # # use uniform distribution if there is no prior knowledge.
  390. # q_times = np.full((1, nb_pd), p_times_uni)
  391. # Kmatrix[i][j] = np.dot(q_times, x)
  392. # Kmatrix[j][i] = Kmatrix[i][j]
  393. # pbar.update(1)
  394. # else:
  395. # reindex nodes using consecutive integers for convenience of kernel calculation.
  396. Gn = [nx.convert_node_labels_to_integers(
  397. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  398. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  399. if p == None and q == None: # p and q are uniform distributions as default.
  400. def init_worker(gn_toshare):
  401. global G_gn
  402. G_gn = gn_toshare
  403. do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
  404. node_label, edge_kernels, edge_label, lmda)
  405. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  406. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  407. return Kmatrix
  408. def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  409. edge_label, lmda, itr):
  410. i = itr[0]
  411. j = itr[1]
  412. return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  413. node_label, edge_kernels, edge_label, lmda)
  414. def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  415. edge_kernels, edge_label, lmda):
  416. # Frist, compute kernels between all pairs of nodes, method borrowed
  417. # from FCSP. It is faster than directly computing all edge kernels
  418. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  419. # graphs compared, which is the most case we went though. For very
  420. # sparse graphs, this would be slow.
  421. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  422. # Compute weight matrix of the direct product graph.
  423. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  424. edge_kernels, edge_label)
  425. # use uniform distribution if there is no prior knowledge.
  426. p_times_uni = 1 / w_dim
  427. p_times = np.full((w_dim, 1), p_times_uni)
  428. x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
  429. xtol=1e-06, maxiter=1000)
  430. # use uniform distribution if there is no prior knowledge.
  431. q_times = np.full((1, w_dim), p_times_uni)
  432. return np.dot(q_times, x)
  433. def func_fp(x, p_times, lmda, w_times):
  434. haha = w_times * x
  435. haha = lmda * haha
  436. haha = p_times + haha
  437. return p_times + lmda * np.dot(w_times, x)
  438. ###############################################################################
  439. def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):
  440. """Calculate walk graph kernels up to n between 2 unlabeled graphs using
  441. spectral decomposition method. Labels will be ignored.
  442. Parameters
  443. ----------
  444. G1, G2 : NetworkX graph
  445. Graphs between which the kernel is calculated.
  446. node_label : string
  447. node attribute used as label.
  448. edge_label : string
  449. edge attribute used as label.
  450. Return
  451. ------
  452. kernel : float
  453. Kernel between 2 graphs.
  454. """
  455. Kmatrix = np.zeros((len(Gn), len(Gn)))
  456. if q == None:
  457. # precompute the spectral decomposition of each graph.
  458. P_list = []
  459. D_list = []
  460. for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if
  461. verbose else Gn):
  462. # don't normalize adjacency matrices if q is a uniform vector. Note
  463. # A actually is the transpose of the adjacency matrix.
  464. A = nx.adjacency_matrix(G, eweight).todense().transpose()
  465. ew, ev = np.linalg.eig(A)
  466. D_list.append(ew)
  467. P_list.append(ev)
  468. # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?
  469. if p == None: # p is uniform distribution as default.
  470. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
  471. # q_T_list = [q.T for q in q_list]
  472. def init_worker(q_T_toshare, P_toshare, D_toshare):
  473. global G_q_T, G_P, G_D
  474. G_q_T = q_T_toshare
  475. G_P = P_toshare
  476. G_D = D_toshare
  477. do_partial = partial(wrapper_sd_do, weight, sub_kernel)
  478. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  479. glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
  480. verbose=verbose)
  481. # pbar = tqdm(
  482. # total=(1 + len(Gn)) * len(Gn) / 2,
  483. # desc='calculating kernels',
  484. # file=sys.stdout)
  485. # for i in range(0, len(Gn)):
  486. # for j in range(i, len(Gn)):
  487. # result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
  488. # D_list[i], D_list[j], weight, sub_kernel)
  489. # Kmatrix[i][j] = result
  490. # Kmatrix[j][i] = Kmatrix[i][j]
  491. # pbar.update(1)
  492. return Kmatrix
  493. def wrapper_sd_do(weight, sub_kernel, itr):
  494. i = itr[0]
  495. j = itr[1]
  496. return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
  497. weight, sub_kernel)
  498. def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
  499. # use uniform distribution if there is no prior knowledge.
  500. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
  501. # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs
  502. # kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
  503. if sub_kernel == 'exp':
  504. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  505. kmiddle = np.diag(np.exp(weight * D_diag))
  506. elif sub_kernel == 'geo':
  507. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  508. kmiddle = np.diag(weight * D_diag)
  509. kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
  510. kmiddle = np.linalg.inv(kmiddle)
  511. return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
  512. ###############################################################################
  513. def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
  514. """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.
  515. Parameters
  516. ----------
  517. G1, G2 : NetworkX graph
  518. Graphs between which the kernel is calculated.
  519. node_label : string
  520. node attribute used as label.
  521. edge_label : string
  522. edge attribute used as label.
  523. Return
  524. ------
  525. kernel : float
  526. Kernel between 2 graphs.
  527. """
  528. pass
  529. ###############################################################################
  530. def getLabels(Gn, node_label, edge_label, directed):
  531. """Get symbolic labels of a graph dataset, where vertex labels are dealt
  532. with by concatenating them to the edge labels of adjacent edges.
  533. """
  534. label_list = []
  535. label_set = set()
  536. for g in Gn:
  537. label_g = {}
  538. for e in g.edges(data=True):
  539. nl1 = g.node[e[0]][node_label]
  540. nl2 = g.node[e[1]][node_label]
  541. if not directed and nl1 > nl2:
  542. nl1, nl2 = nl2, nl1
  543. label = (nl1, e[2][edge_label], nl2)
  544. label_g[(e[0], e[1])] = label
  545. label_list.append(label_g)
  546. label_set = set([l for lg in label_list for l in lg.values()])
  547. return label_list, len(label_set)
  548. def filterGramMatrix(gmt, label_dict, label, directed):
  549. """Compute (the transpose of) the Gram matrix filtered by a label.
  550. """
  551. gmf = np.zeros(gmt.shape)
  552. for (n1, n2), l in label_dict.items():
  553. if l == label:
  554. gmf[n2, n1] = gmt[n2, n1]
  555. if not directed:
  556. gmf[n1, n2] = gmt[n1, n2]
  557. return gmf
  558. def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
  559. '''Compute vertex kernels between vertices of two graphs.
  560. '''
  561. vk_dict = {} # shortest path matrices dict
  562. if ds_attrs['node_labeled']:
  563. # node symb and non-synb labeled
  564. if ds_attrs['node_attr_dim'] > 0:
  565. kn = node_kernels['mix']
  566. for n1 in g1.nodes(data=True):
  567. for n2 in g2.nodes(data=True):
  568. vk_dict[(n1[0], n2[0])] = kn(
  569. n1[1][node_label], n2[1][node_label],
  570. n1[1]['attributes'], n2[1]['attributes'])
  571. # node symb labeled
  572. else:
  573. kn = node_kernels['symb']
  574. for n1 in g1.nodes(data=True):
  575. for n2 in g2.nodes(data=True):
  576. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  577. n2[1][node_label])
  578. else:
  579. # node non-synb labeled
  580. if ds_attrs['node_attr_dim'] > 0:
  581. kn = node_kernels['nsymb']
  582. for n1 in g1.nodes(data=True):
  583. for n2 in g2.nodes(data=True):
  584. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  585. n2[1]['attributes'])
  586. # node unlabeled
  587. else:
  588. pass
  589. return vk_dict
  590. def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
  591. '''Compute weight matrix of the direct product graph.
  592. '''
  593. w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
  594. w_times = np.zeros((w_dim, w_dim))
  595. if vk_dict: # node labeled
  596. if ds_attrs['is_directed']:
  597. if ds_attrs['edge_labeled']:
  598. # edge symb and non-synb labeled
  599. if ds_attrs['edge_attr_dim'] > 0:
  600. ke = edge_kernels['mix']
  601. for e1 in g1.edges(data=True):
  602. for e2 in g2.edges(data=True):
  603. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  604. e1[2]['attributes'], e2[2]['attributes'])
  605. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  606. e1[1] * nx.number_of_nodes(g2) + e2[1])
  607. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  608. * ek_temp * vk_dict[(e1[1], e2[1])]
  609. # edge symb labeled
  610. else:
  611. ke = edge_kernels['symb']
  612. for e1 in g1.edges(data=True):
  613. for e2 in g2.edges(data=True):
  614. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  615. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  616. e1[1] * nx.number_of_nodes(g2) + e2[1])
  617. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  618. * ek_temp * vk_dict[(e1[1], e2[1])]
  619. else:
  620. # edge non-synb labeled
  621. if ds_attrs['edge_attr_dim'] > 0:
  622. ke = edge_kernels['nsymb']
  623. for e1 in g1.edges(data=True):
  624. for e2 in g2.edges(data=True):
  625. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  626. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  627. e1[1] * nx.number_of_nodes(g2) + e2[1])
  628. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  629. * ek_temp * vk_dict[(e1[1], e2[1])]
  630. # edge unlabeled
  631. else:
  632. for e1 in g1.edges(data=True):
  633. for e2 in g2.edges(data=True):
  634. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  635. e1[1] * nx.number_of_nodes(g2) + e2[1])
  636. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  637. * vk_dict[(e1[1], e2[1])]
  638. else: # undirected
  639. if ds_attrs['edge_labeled']:
  640. # edge symb and non-synb labeled
  641. if ds_attrs['edge_attr_dim'] > 0:
  642. ke = edge_kernels['mix']
  643. for e1 in g1.edges(data=True):
  644. for e2 in g2.edges(data=True):
  645. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  646. e1[2]['attributes'], e2[2]['attributes'])
  647. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  648. e1[1] * nx.number_of_nodes(g2) + e2[1])
  649. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  650. * ek_temp * vk_dict[(e1[1], e2[1])] \
  651. + vk_dict[(e1[0], e2[1])] \
  652. * ek_temp * vk_dict[(e1[1], e2[0])]
  653. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  654. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  655. e1[1] * nx.number_of_nodes(g2) + e2[0])
  656. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  657. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  658. # edge symb labeled
  659. else:
  660. ke = edge_kernels['symb']
  661. for e1 in g1.edges(data=True):
  662. for e2 in g2.edges(data=True):
  663. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  664. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  665. e1[1] * nx.number_of_nodes(g2) + e2[1])
  666. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  667. * ek_temp * vk_dict[(e1[1], e2[1])] \
  668. + vk_dict[(e1[0], e2[1])] \
  669. * ek_temp * vk_dict[(e1[1], e2[0])]
  670. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  671. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  672. e1[1] * nx.number_of_nodes(g2) + e2[0])
  673. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  674. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  675. else:
  676. # edge non-synb labeled
  677. if ds_attrs['edge_attr_dim'] > 0:
  678. ke = edge_kernels['nsymb']
  679. for e1 in g1.edges(data=True):
  680. for e2 in g2.edges(data=True):
  681. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  682. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  683. e1[1] * nx.number_of_nodes(g2) + e2[1])
  684. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  685. * ek_temp * vk_dict[(e1[1], e2[1])] \
  686. + vk_dict[(e1[0], e2[1])] \
  687. * ek_temp * vk_dict[(e1[1], e2[0])]
  688. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  689. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  690. e1[1] * nx.number_of_nodes(g2) + e2[0])
  691. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  692. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  693. # edge unlabeled
  694. else:
  695. for e1 in g1.edges(data=True):
  696. for e2 in g2.edges(data=True):
  697. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  698. e1[1] * nx.number_of_nodes(g2) + e2[1])
  699. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  700. * vk_dict[(e1[1], e2[1])] \
  701. + vk_dict[(e1[0], e2[1])] \
  702. * vk_dict[(e1[1], e2[0])]
  703. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  704. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  705. e1[1] * nx.number_of_nodes(g2) + e2[0])
  706. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  707. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  708. else: # node unlabeled
  709. if ds_attrs['is_directed']:
  710. if ds_attrs['edge_labeled']:
  711. # edge symb and non-synb labeled
  712. if ds_attrs['edge_attr_dim'] > 0:
  713. ke = edge_kernels['mix']
  714. for e1 in g1.edges(data=True):
  715. for e2 in g2.edges(data=True):
  716. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  717. e1[2]['attributes'], e2[2]['attributes'])
  718. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  719. e1[1] * nx.number_of_nodes(g2) + e2[1])
  720. w_times[w_idx] = ek_temp
  721. # edge symb labeled
  722. else:
  723. ke = edge_kernels['symb']
  724. for e1 in g1.edges(data=True):
  725. for e2 in g2.edges(data=True):
  726. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  727. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  728. e1[1] * nx.number_of_nodes(g2) + e2[1])
  729. w_times[w_idx] = ek_temp
  730. else:
  731. # edge non-synb labeled
  732. if ds_attrs['edge_attr_dim'] > 0:
  733. ke = edge_kernels['nsymb']
  734. for e1 in g1.edges(data=True):
  735. for e2 in g2.edges(data=True):
  736. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  737. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  738. e1[1] * nx.number_of_nodes(g2) + e2[1])
  739. w_times[w_idx] = ek_temp
  740. # edge unlabeled
  741. else:
  742. for e1 in g1.edges(data=True):
  743. for e2 in g2.edges(data=True):
  744. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  745. e1[1] * nx.number_of_nodes(g2) + e2[1])
  746. w_times[w_idx] = 1
  747. else: # undirected
  748. if ds_attrs['edge_labeled']:
  749. # edge symb and non-synb labeled
  750. if ds_attrs['edge_attr_dim'] > 0:
  751. ke = edge_kernels['mix']
  752. for e1 in g1.edges(data=True):
  753. for e2 in g2.edges(data=True):
  754. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  755. e1[2]['attributes'], e2[2]['attributes'])
  756. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  757. e1[1] * nx.number_of_nodes(g2) + e2[1])
  758. w_times[w_idx] = ek_temp
  759. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  760. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  761. e1[1] * nx.number_of_nodes(g2) + e2[0])
  762. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  763. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  764. # edge symb labeled
  765. else:
  766. ke = edge_kernels['symb']
  767. for e1 in g1.edges(data=True):
  768. for e2 in g2.edges(data=True):
  769. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  770. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  771. e1[1] * nx.number_of_nodes(g2) + e2[1])
  772. w_times[w_idx] = ek_temp
  773. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  774. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  775. e1[1] * nx.number_of_nodes(g2) + e2[0])
  776. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  777. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  778. else:
  779. # edge non-synb labeled
  780. if ds_attrs['edge_attr_dim'] > 0:
  781. ke = edge_kernels['nsymb']
  782. for e1 in g1.edges(data=True):
  783. for e2 in g2.edges(data=True):
  784. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  785. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  786. e1[1] * nx.number_of_nodes(g2) + e2[1])
  787. w_times[w_idx] = ek_temp
  788. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  789. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  790. e1[1] * nx.number_of_nodes(g2) + e2[0])
  791. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  792. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  793. # edge unlabeled
  794. else:
  795. for e1 in g1.edges(data=True):
  796. for e2 in g2.edges(data=True):
  797. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  798. e1[1] * nx.number_of_nodes(g2) + e2[1])
  799. w_times[w_idx] = 1
  800. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  801. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  802. e1[1] * nx.number_of_nodes(g2) + e2[0])
  803. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  804. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  805. return w_times, w_dim

A Python package for graph kernels, graph edit distances and graph pre-image problem.