You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

randomWalkKernel.py 41 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892
  1. """
  2. @author: linlin
  3. @references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
  4. """
  5. import sys
  6. sys.path.insert(0, "../")
  7. import time
  8. from functools import partial
  9. from tqdm import tqdm
  10. import networkx as nx
  11. import numpy as np
  12. from scipy.sparse import identity, kron
  13. from scipy.sparse.linalg import cg
  14. from scipy.optimize import fixed_point
  15. from pygraph.utils.graphdataset import get_dataset_attributes
  16. from pygraph.utils.parallel import parallel_gm
  17. def randomwalkkernel(*args,
  18. # params for all method.
  19. compute_method=None,
  20. weight=1,
  21. p=None,
  22. q=None,
  23. edge_weight=None,
  24. # params for conjugate and fp method.
  25. node_kernels=None,
  26. edge_kernels=None,
  27. node_label='atom',
  28. edge_label='bond_type',
  29. # params for spectral method.
  30. sub_kernel=None,
  31. n_jobs=None,
  32. verbose=True):
  33. """Calculate random walk graph kernels.
  34. Parameters
  35. ----------
  36. Gn : List of NetworkX graph
  37. List of graphs between which the kernels are calculated.
  38. /
  39. G1, G2 : NetworkX graphs
  40. Two graphs between which the kernel is calculated.
  41. compute_method : string
  42. Method used to compute kernel. The Following choices are
  43. available:
  44. 'sylvester' - Sylvester equation method.
  45. 'conjugate' - conjugate gradient method.
  46. 'fp' - fixed-point iterations.
  47. 'spectral' - spectral decomposition.
  48. weight : float
  49. A constant weight set for random walks of length h.
  50. p : None
  51. Initial probability distribution on the unlabeled direct product graph
  52. of two graphs. It is set to be uniform over all vertices in the direct
  53. product graph.
  54. q : None
  55. Stopping probability distribution on the unlabeled direct product graph
  56. of two graphs. It is set to be uniform over all vertices in the direct
  57. product graph.
  58. edge_weight: float
  59. Edge attribute name corresponding to the edge weight.
  60. node_kernels: dict
  61. A dictionary of kernel functions for nodes, including 3 items: 'symb'
  62. for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
  63. for both labels. The first 2 functions take two node labels as
  64. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  65. non-symbolic label for each the two nodes. Each label is in form of 2-D
  66. dimension array (n_samples, n_features). Each function returns a number
  67. as the kernel value. Ignored when nodes are unlabeled. This argument
  68. is designated to conjugate gradient method and fixed-point iterations.
  69. edge_kernels: dict
  70. A dictionary of kernel functions for edges, including 3 items: 'symb'
  71. for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
  72. for both labels. The first 2 functions take two edge labels as
  73. parameters, and the 'mix' function takes 4 parameters, a symbolic and a
  74. non-symbolic label for each the two edges. Each label is in form of 2-D
  75. dimension array (n_samples, n_features). Each function returns a number
  76. as the kernel value. Ignored when edges are unlabeled. This argument
  77. is designated to conjugate gradient method and fixed-point iterations.
  78. node_label: string
  79. Node attribute used as label. The default node label is atom. This
  80. argument is designated to conjugate gradient method and fixed-point
  81. iterations.
  82. edge_label : string
  83. Edge attribute used as label. The default edge label is bond_type. This
  84. argument is designated to conjugate gradient method and fixed-point
  85. iterations.
  86. sub_kernel: string
  87. Method used to compute walk kernel. The Following choices are
  88. available:
  89. 'exp' : method based on exponential serials.
  90. 'geo' : method based on geometric serials.
  91. n_jobs: int
  92. Number of jobs for parallelization.
  93. Return
  94. ------
  95. Kmatrix : Numpy matrix
  96. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  97. """
  98. compute_method = compute_method.lower()
  99. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  100. Gn = [g.copy() for g in Gn]
  101. eweight = None
  102. if edge_weight == None:
  103. if verbose:
  104. print('\n None edge weight specified. Set all weight to 1.\n')
  105. else:
  106. try:
  107. some_weight = list(
  108. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  109. if isinstance(some_weight, float) or isinstance(some_weight, int):
  110. eweight = edge_weight
  111. else:
  112. if verbose:
  113. print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  114. % edge_weight)
  115. except:
  116. if verbose:
  117. print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  118. % edge_weight)
  119. ds_attrs = get_dataset_attributes(
  120. Gn,
  121. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  122. 'edge_attr_dim', 'is_directed'],
  123. node_label=node_label,
  124. edge_label=edge_label)
  125. # remove graphs with no edges, as no walk can be found in their structures,
  126. # so the weight matrix between such a graph and itself might be zero.
  127. len_gn = len(Gn)
  128. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  129. idx = [G[0] for G in Gn]
  130. Gn = [G[1] for G in Gn]
  131. if len(Gn) != len_gn:
  132. if verbose:
  133. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  134. (len_gn - len(Gn)))
  135. start_time = time.time()
  136. # # get vertex and edge concatenated labels for each graph
  137. # label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
  138. # gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
  139. if compute_method == 'sylvester':
  140. if verbose:
  141. import warnings
  142. warnings.warn('All labels are ignored.')
  143. Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, verbose=verbose)
  144. elif compute_method == 'conjugate':
  145. Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels,
  146. edge_kernels, node_label, edge_label,
  147. eweight, n_jobs, verbose=verbose)
  148. elif compute_method == 'fp':
  149. Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
  150. edge_kernels, node_label, edge_label,
  151. eweight, n_jobs, verbose=verbose)
  152. elif compute_method == 'spectral':
  153. if verbose:
  154. import warnings
  155. warnings.warn('All labels are ignored. Only works for undirected graphs.')
  156. Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
  157. eweight, n_jobs, verbose=verbose)
  158. elif compute_method == 'kron':
  159. pass
  160. for i in range(0, len(Gn)):
  161. for j in range(i, len(Gn)):
  162. Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
  163. node_label, edge_label)
  164. Kmatrix[j][i] = Kmatrix[i][j]
  165. else:
  166. raise Exception(
  167. 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
  168. )
  169. run_time = time.time() - start_time
  170. if verbose:
  171. print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
  172. % (len(Gn), run_time))
  173. return Kmatrix, run_time, idx
  174. ###############################################################################
  175. def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
  176. """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.
  177. Parameters
  178. ----------
  179. G1, G2 : NetworkX graph
  180. Graphs between which the kernel is calculated.
  181. node_label : string
  182. node attribute used as label.
  183. edge_label : string
  184. edge attribute used as label.
  185. Return
  186. ------
  187. kernel : float
  188. Kernel between 2 graphs.
  189. """
  190. Kmatrix = np.zeros((len(Gn), len(Gn)))
  191. if q == None:
  192. # don't normalize adjacency matrices if q is a uniform vector. Note
  193. # A_wave_list actually contains the transposes of the adjacency matrices.
  194. A_wave_list = [
  195. nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  196. (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if
  197. verbose else Gn)
  198. ]
  199. # # normalized adjacency matrices
  200. # A_wave_list = []
  201. # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
  202. # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
  203. # norm = A_tilde.sum(axis=0)
  204. # norm[norm == 0] = 1
  205. # A_wave_list.append(A_tilde / norm)
  206. if p == None: # p is uniform distribution as default.
  207. def init_worker(Awl_toshare):
  208. global G_Awl
  209. G_Awl = Awl_toshare
  210. do_partial = partial(wrapper_se_do, lmda)
  211. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  212. glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose)
  213. # pbar = tqdm(
  214. # total=(1 + len(Gn)) * len(Gn) / 2,
  215. # desc='calculating kernels',
  216. # file=sys.stdout)
  217. # for i in range(0, len(Gn)):
  218. # for j in range(i, len(Gn)):
  219. # S = lmda * A_wave_list[j]
  220. # T_t = A_wave_list[i]
  221. # # use uniform distribution if there is no prior knowledge.
  222. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  223. # p_times_uni = 1 / nb_pd
  224. # M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
  225. # X = dlyap(S, T_t, M0)
  226. # X = np.reshape(X, (-1, 1), order='F')
  227. # # use uniform distribution if there is no prior knowledge.
  228. # q_times = np.full((1, nb_pd), p_times_uni)
  229. # Kmatrix[i][j] = np.dot(q_times, X)
  230. # Kmatrix[j][i] = Kmatrix[i][j]
  231. # pbar.update(1)
  232. return Kmatrix
  233. def wrapper_se_do(lmda, itr):
  234. i = itr[0]
  235. j = itr[1]
  236. return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
  237. def _se_do(A_wave1, A_wave2, lmda):
  238. from control import dlyap
  239. S = lmda * A_wave2
  240. T_t = A_wave1
  241. # use uniform distribution if there is no prior knowledge.
  242. nb_pd = len(A_wave1) * len(A_wave2)
  243. p_times_uni = 1 / nb_pd
  244. M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
  245. X = dlyap(S, T_t, M0)
  246. X = np.reshape(X, (-1, 1), order='F')
  247. # use uniform distribution if there is no prior knowledge.
  248. q_times = np.full((1, nb_pd), p_times_uni)
  249. return np.dot(q_times, X)
  250. ###############################################################################
  251. def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  252. node_label, edge_label, eweight, n_jobs, verbose=True):
  253. """Calculate walk graph kernels up to n between 2 graphs using conjugate method.
  254. Parameters
  255. ----------
  256. G1, G2 : NetworkX graph
  257. Graphs between which the kernel is calculated.
  258. node_label : string
  259. node attribute used as label.
  260. edge_label : string
  261. edge attribute used as label.
  262. Return
  263. ------
  264. kernel : float
  265. Kernel between 2 graphs.
  266. """
  267. Kmatrix = np.zeros((len(Gn), len(Gn)))
  268. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  269. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
  270. # # this is faster from unlabeled graphs. @todo: why?
  271. # if q == None:
  272. # # don't normalize adjacency matrices if q is a uniform vector. Note
  273. # # A_wave_list actually contains the transposes of the adjacency matrices.
  274. # A_wave_list = [
  275. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  276. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  277. # ]
  278. # if p == None: # p is uniform distribution as default.
  279. # def init_worker(Awl_toshare):
  280. # global G_Awl
  281. # G_Awl = Awl_toshare
  282. # do_partial = partial(wrapper_cg_unlabled_do, lmda)
  283. # parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  284. # glbv=(A_wave_list,), n_jobs=n_jobs)
  285. # else:
  286. # reindex nodes using consecutive integers for convenience of kernel calculation.
  287. Gn = [nx.convert_node_labels_to_integers(
  288. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  289. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  290. if p == None and q == None: # p and q are uniform distributions as default.
  291. def init_worker(gn_toshare):
  292. global G_gn
  293. G_gn = gn_toshare
  294. do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
  295. node_label, edge_kernels, edge_label, lmda)
  296. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  297. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  298. # pbar = tqdm(
  299. # total=(1 + len(Gn)) * len(Gn) / 2,
  300. # desc='calculating kernels',
  301. # file=sys.stdout)
  302. # for i in range(0, len(Gn)):
  303. # for j in range(i, len(Gn)):
  304. # result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
  305. # node_label, edge_kernels, edge_label, lmda)
  306. # Kmatrix[i][j] = result
  307. # Kmatrix[j][i] = Kmatrix[i][j]
  308. # pbar.update(1)
  309. return Kmatrix
  310. def wrapper_cg_unlabled_do(lmda, itr):
  311. i = itr[0]
  312. j = itr[1]
  313. return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
  314. def _cg_unlabled_do(A_wave1, A_wave2, lmda):
  315. nb_pd = len(A_wave1) * len(A_wave2)
  316. p_times_uni = 1 / nb_pd
  317. w_times = kron(A_wave1, A_wave2).todense()
  318. A = identity(w_times.shape[0]) - w_times * lmda
  319. b = np.full((nb_pd, 1), p_times_uni)
  320. x, _ = cg(A, b)
  321. # use uniform distribution if there is no prior knowledge.
  322. q_times = np.full((1, nb_pd), p_times_uni)
  323. return np.dot(q_times, x)
  324. def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  325. edge_label, lmda, itr):
  326. i = itr[0]
  327. j = itr[1]
  328. return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  329. node_label, edge_kernels, edge_label, lmda)
  330. def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  331. edge_kernels, edge_label, lmda):
  332. # Frist, compute kernels between all pairs of nodes, method borrowed
  333. # from FCSP. It is faster than directly computing all edge kernels
  334. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  335. # graphs compared, which is the most case we went though. For very
  336. # sparse graphs, this would be slow.
  337. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  338. # Compute weight matrix of the direct product graph.
  339. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  340. edge_kernels, edge_label)
  341. # use uniform distribution if there is no prior knowledge.
  342. p_times_uni = 1 / w_dim
  343. A = identity(w_times.shape[0]) - w_times * lmda
  344. b = np.full((w_dim, 1), p_times_uni)
  345. x, _ = cg(A, b)
  346. # use uniform distribution if there is no prior knowledge.
  347. q_times = np.full((1, w_dim), p_times_uni)
  348. return np.dot(q_times, x)
  349. ###############################################################################
  350. def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  351. node_label, edge_label, eweight, n_jobs, verbose=True):
  352. """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.
  353. Parameters
  354. ----------
  355. G1, G2 : NetworkX graph
  356. Graphs between which the kernel is calculated.
  357. node_label : string
  358. node attribute used as label.
  359. edge_label : string
  360. edge attribute used as label.
  361. Return
  362. ------
  363. kernel : float
  364. Kernel between 2 graphs.
  365. """
  366. Kmatrix = np.zeros((len(Gn), len(Gn)))
  367. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  368. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
  369. # # this is faster from unlabeled graphs. @todo: why?
  370. # if q == None:
  371. # # don't normalize adjacency matrices if q is a uniform vector. Note
  372. # # A_wave_list actually contains the transposes of the adjacency matrices.
  373. # A_wave_list = [
  374. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  375. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  376. # ]
  377. # if p == None: # p is uniform distribution as default.
  378. # pbar = tqdm(
  379. # total=(1 + len(Gn)) * len(Gn) / 2,
  380. # desc='calculating kernels',
  381. # file=sys.stdout)
  382. # for i in range(0, len(Gn)):
  383. # for j in range(i, len(Gn)):
  384. # # use uniform distribution if there is no prior knowledge.
  385. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  386. # p_times_uni = 1 / nb_pd
  387. # w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
  388. # p_times = np.full((nb_pd, 1), p_times_uni)
  389. # x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
  390. # # use uniform distribution if there is no prior knowledge.
  391. # q_times = np.full((1, nb_pd), p_times_uni)
  392. # Kmatrix[i][j] = np.dot(q_times, x)
  393. # Kmatrix[j][i] = Kmatrix[i][j]
  394. # pbar.update(1)
  395. # else:
  396. # reindex nodes using consecutive integers for convenience of kernel calculation.
  397. Gn = [nx.convert_node_labels_to_integers(
  398. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  399. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  400. if p == None and q == None: # p and q are uniform distributions as default.
  401. def init_worker(gn_toshare):
  402. global G_gn
  403. G_gn = gn_toshare
  404. do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
  405. node_label, edge_kernels, edge_label, lmda)
  406. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  407. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  408. return Kmatrix
  409. def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  410. edge_label, lmda, itr):
  411. i = itr[0]
  412. j = itr[1]
  413. return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  414. node_label, edge_kernels, edge_label, lmda)
  415. def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  416. edge_kernels, edge_label, lmda):
  417. # Frist, compute kernels between all pairs of nodes, method borrowed
  418. # from FCSP. It is faster than directly computing all edge kernels
  419. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  420. # graphs compared, which is the most case we went though. For very
  421. # sparse graphs, this would be slow.
  422. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  423. # Compute weight matrix of the direct product graph.
  424. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  425. edge_kernels, edge_label)
  426. # use uniform distribution if there is no prior knowledge.
  427. p_times_uni = 1 / w_dim
  428. p_times = np.full((w_dim, 1), p_times_uni)
  429. x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
  430. xtol=1e-06, maxiter=1000)
  431. # use uniform distribution if there is no prior knowledge.
  432. q_times = np.full((1, w_dim), p_times_uni)
  433. return np.dot(q_times, x)
  434. def func_fp(x, p_times, lmda, w_times):
  435. haha = w_times * x
  436. haha = lmda * haha
  437. haha = p_times + haha
  438. return p_times + lmda * np.dot(w_times, x)
  439. ###############################################################################
  440. def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):
  441. """Calculate walk graph kernels up to n between 2 unlabeled graphs using
  442. spectral decomposition method. Labels will be ignored.
  443. Parameters
  444. ----------
  445. G1, G2 : NetworkX graph
  446. Graphs between which the kernel is calculated.
  447. node_label : string
  448. node attribute used as label.
  449. edge_label : string
  450. edge attribute used as label.
  451. Return
  452. ------
  453. kernel : float
  454. Kernel between 2 graphs.
  455. """
  456. Kmatrix = np.zeros((len(Gn), len(Gn)))
  457. if q == None:
  458. # precompute the spectral decomposition of each graph.
  459. P_list = []
  460. D_list = []
  461. for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if
  462. verbose else Gn):
  463. # don't normalize adjacency matrices if q is a uniform vector. Note
  464. # A actually is the transpose of the adjacency matrix.
  465. A = nx.adjacency_matrix(G, eweight).todense().transpose()
  466. ew, ev = np.linalg.eig(A)
  467. D_list.append(ew)
  468. P_list.append(ev)
  469. # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?
  470. if p == None: # p is uniform distribution as default.
  471. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
  472. # q_T_list = [q.T for q in q_list]
  473. def init_worker(q_T_toshare, P_toshare, D_toshare):
  474. global G_q_T, G_P, G_D
  475. G_q_T = q_T_toshare
  476. G_P = P_toshare
  477. G_D = D_toshare
  478. do_partial = partial(wrapper_sd_do, weight, sub_kernel)
  479. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  480. glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
  481. verbose=verbose)
  482. # pbar = tqdm(
  483. # total=(1 + len(Gn)) * len(Gn) / 2,
  484. # desc='calculating kernels',
  485. # file=sys.stdout)
  486. # for i in range(0, len(Gn)):
  487. # for j in range(i, len(Gn)):
  488. # result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
  489. # D_list[i], D_list[j], weight, sub_kernel)
  490. # Kmatrix[i][j] = result
  491. # Kmatrix[j][i] = Kmatrix[i][j]
  492. # pbar.update(1)
  493. return Kmatrix
  494. def wrapper_sd_do(weight, sub_kernel, itr):
  495. i = itr[0]
  496. j = itr[1]
  497. return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
  498. weight, sub_kernel)
  499. def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
  500. # use uniform distribution if there is no prior knowledge.
  501. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
  502. # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs
  503. # kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
  504. if sub_kernel == 'exp':
  505. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  506. kmiddle = np.diag(np.exp(weight * D_diag))
  507. elif sub_kernel == 'geo':
  508. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  509. kmiddle = np.diag(weight * D_diag)
  510. kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
  511. kmiddle = np.linalg.inv(kmiddle)
  512. return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
  513. ###############################################################################
  514. def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
  515. """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.
  516. Parameters
  517. ----------
  518. G1, G2 : NetworkX graph
  519. Graphs between which the kernel is calculated.
  520. node_label : string
  521. node attribute used as label.
  522. edge_label : string
  523. edge attribute used as label.
  524. Return
  525. ------
  526. kernel : float
  527. Kernel between 2 graphs.
  528. """
  529. pass
  530. ###############################################################################
  531. def getLabels(Gn, node_label, edge_label, directed):
  532. """Get symbolic labels of a graph dataset, where vertex labels are dealt
  533. with by concatenating them to the edge labels of adjacent edges.
  534. """
  535. label_list = []
  536. label_set = set()
  537. for g in Gn:
  538. label_g = {}
  539. for e in g.edges(data=True):
  540. nl1 = g.node[e[0]][node_label]
  541. nl2 = g.node[e[1]][node_label]
  542. if not directed and nl1 > nl2:
  543. nl1, nl2 = nl2, nl1
  544. label = (nl1, e[2][edge_label], nl2)
  545. label_g[(e[0], e[1])] = label
  546. label_list.append(label_g)
  547. label_set = set([l for lg in label_list for l in lg.values()])
  548. return label_list, len(label_set)
  549. def filterGramMatrix(gmt, label_dict, label, directed):
  550. """Compute (the transpose of) the Gram matrix filtered by a label.
  551. """
  552. gmf = np.zeros(gmt.shape)
  553. for (n1, n2), l in label_dict.items():
  554. if l == label:
  555. gmf[n2, n1] = gmt[n2, n1]
  556. if not directed:
  557. gmf[n1, n2] = gmt[n1, n2]
  558. return gmf
  559. def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
  560. '''Compute vertex kernels between vertices of two graphs.
  561. '''
  562. vk_dict = {} # shortest path matrices dict
  563. if ds_attrs['node_labeled']:
  564. # node symb and non-synb labeled
  565. if ds_attrs['node_attr_dim'] > 0:
  566. kn = node_kernels['mix']
  567. for n1 in g1.nodes(data=True):
  568. for n2 in g2.nodes(data=True):
  569. vk_dict[(n1[0], n2[0])] = kn(
  570. n1[1][node_label], n2[1][node_label],
  571. n1[1]['attributes'], n2[1]['attributes'])
  572. # node symb labeled
  573. else:
  574. kn = node_kernels['symb']
  575. for n1 in g1.nodes(data=True):
  576. for n2 in g2.nodes(data=True):
  577. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  578. n2[1][node_label])
  579. else:
  580. # node non-synb labeled
  581. if ds_attrs['node_attr_dim'] > 0:
  582. kn = node_kernels['nsymb']
  583. for n1 in g1.nodes(data=True):
  584. for n2 in g2.nodes(data=True):
  585. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  586. n2[1]['attributes'])
  587. # node unlabeled
  588. else:
  589. pass
  590. return vk_dict
  591. def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
  592. '''Compute weight matrix of the direct product graph.
  593. '''
  594. w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
  595. w_times = np.zeros((w_dim, w_dim))
  596. if vk_dict: # node labeled
  597. if ds_attrs['is_directed']:
  598. if ds_attrs['edge_labeled']:
  599. # edge symb and non-synb labeled
  600. if ds_attrs['edge_attr_dim'] > 0:
  601. ke = edge_kernels['mix']
  602. for e1 in g1.edges(data=True):
  603. for e2 in g2.edges(data=True):
  604. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  605. e1[2]['attributes'], e2[2]['attributes'])
  606. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  607. e1[1] * nx.number_of_nodes(g2) + e2[1])
  608. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  609. * ek_temp * vk_dict[(e1[1], e2[1])]
  610. # edge symb labeled
  611. else:
  612. ke = edge_kernels['symb']
  613. for e1 in g1.edges(data=True):
  614. for e2 in g2.edges(data=True):
  615. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  616. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  617. e1[1] * nx.number_of_nodes(g2) + e2[1])
  618. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  619. * ek_temp * vk_dict[(e1[1], e2[1])]
  620. else:
  621. # edge non-synb labeled
  622. if ds_attrs['edge_attr_dim'] > 0:
  623. ke = edge_kernels['nsymb']
  624. for e1 in g1.edges(data=True):
  625. for e2 in g2.edges(data=True):
  626. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  627. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  628. e1[1] * nx.number_of_nodes(g2) + e2[1])
  629. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  630. * ek_temp * vk_dict[(e1[1], e2[1])]
  631. # edge unlabeled
  632. else:
  633. for e1 in g1.edges(data=True):
  634. for e2 in g2.edges(data=True):
  635. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  636. e1[1] * nx.number_of_nodes(g2) + e2[1])
  637. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  638. * vk_dict[(e1[1], e2[1])]
  639. else: # undirected
  640. if ds_attrs['edge_labeled']:
  641. # edge symb and non-synb labeled
  642. if ds_attrs['edge_attr_dim'] > 0:
  643. ke = edge_kernels['mix']
  644. for e1 in g1.edges(data=True):
  645. for e2 in g2.edges(data=True):
  646. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  647. e1[2]['attributes'], e2[2]['attributes'])
  648. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  649. e1[1] * nx.number_of_nodes(g2) + e2[1])
  650. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  651. * ek_temp * vk_dict[(e1[1], e2[1])] \
  652. + vk_dict[(e1[0], e2[1])] \
  653. * ek_temp * vk_dict[(e1[1], e2[0])]
  654. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  655. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  656. e1[1] * nx.number_of_nodes(g2) + e2[0])
  657. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  658. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  659. # edge symb labeled
  660. else:
  661. ke = edge_kernels['symb']
  662. for e1 in g1.edges(data=True):
  663. for e2 in g2.edges(data=True):
  664. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  665. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  666. e1[1] * nx.number_of_nodes(g2) + e2[1])
  667. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  668. * ek_temp * vk_dict[(e1[1], e2[1])] \
  669. + vk_dict[(e1[0], e2[1])] \
  670. * ek_temp * vk_dict[(e1[1], e2[0])]
  671. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  672. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  673. e1[1] * nx.number_of_nodes(g2) + e2[0])
  674. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  675. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  676. else:
  677. # edge non-synb labeled
  678. if ds_attrs['edge_attr_dim'] > 0:
  679. ke = edge_kernels['nsymb']
  680. for e1 in g1.edges(data=True):
  681. for e2 in g2.edges(data=True):
  682. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  683. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  684. e1[1] * nx.number_of_nodes(g2) + e2[1])
  685. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  686. * ek_temp * vk_dict[(e1[1], e2[1])] \
  687. + vk_dict[(e1[0], e2[1])] \
  688. * ek_temp * vk_dict[(e1[1], e2[0])]
  689. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  690. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  691. e1[1] * nx.number_of_nodes(g2) + e2[0])
  692. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  693. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  694. # edge unlabeled
  695. else:
  696. for e1 in g1.edges(data=True):
  697. for e2 in g2.edges(data=True):
  698. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  699. e1[1] * nx.number_of_nodes(g2) + e2[1])
  700. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  701. * vk_dict[(e1[1], e2[1])] \
  702. + vk_dict[(e1[0], e2[1])] \
  703. * vk_dict[(e1[1], e2[0])]
  704. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  705. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  706. e1[1] * nx.number_of_nodes(g2) + e2[0])
  707. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  708. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  709. else: # node unlabeled
  710. if ds_attrs['is_directed']:
  711. if ds_attrs['edge_labeled']:
  712. # edge symb and non-synb labeled
  713. if ds_attrs['edge_attr_dim'] > 0:
  714. ke = edge_kernels['mix']
  715. for e1 in g1.edges(data=True):
  716. for e2 in g2.edges(data=True):
  717. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  718. e1[2]['attributes'], e2[2]['attributes'])
  719. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  720. e1[1] * nx.number_of_nodes(g2) + e2[1])
  721. w_times[w_idx] = ek_temp
  722. # edge symb labeled
  723. else:
  724. ke = edge_kernels['symb']
  725. for e1 in g1.edges(data=True):
  726. for e2 in g2.edges(data=True):
  727. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  728. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  729. e1[1] * nx.number_of_nodes(g2) + e2[1])
  730. w_times[w_idx] = ek_temp
  731. else:
  732. # edge non-synb labeled
  733. if ds_attrs['edge_attr_dim'] > 0:
  734. ke = edge_kernels['nsymb']
  735. for e1 in g1.edges(data=True):
  736. for e2 in g2.edges(data=True):
  737. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  738. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  739. e1[1] * nx.number_of_nodes(g2) + e2[1])
  740. w_times[w_idx] = ek_temp
  741. # edge unlabeled
  742. else:
  743. for e1 in g1.edges(data=True):
  744. for e2 in g2.edges(data=True):
  745. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  746. e1[1] * nx.number_of_nodes(g2) + e2[1])
  747. w_times[w_idx] = 1
  748. else: # undirected
  749. if ds_attrs['edge_labeled']:
  750. # edge symb and non-synb labeled
  751. if ds_attrs['edge_attr_dim'] > 0:
  752. ke = edge_kernels['mix']
  753. for e1 in g1.edges(data=True):
  754. for e2 in g2.edges(data=True):
  755. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  756. e1[2]['attributes'], e2[2]['attributes'])
  757. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  758. e1[1] * nx.number_of_nodes(g2) + e2[1])
  759. w_times[w_idx] = ek_temp
  760. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  761. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  762. e1[1] * nx.number_of_nodes(g2) + e2[0])
  763. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  764. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  765. # edge symb labeled
  766. else:
  767. ke = edge_kernels['symb']
  768. for e1 in g1.edges(data=True):
  769. for e2 in g2.edges(data=True):
  770. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  771. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  772. e1[1] * nx.number_of_nodes(g2) + e2[1])
  773. w_times[w_idx] = ek_temp
  774. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  775. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  776. e1[1] * nx.number_of_nodes(g2) + e2[0])
  777. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  778. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  779. else:
  780. # edge non-synb labeled
  781. if ds_attrs['edge_attr_dim'] > 0:
  782. ke = edge_kernels['nsymb']
  783. for e1 in g1.edges(data=True):
  784. for e2 in g2.edges(data=True):
  785. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  786. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  787. e1[1] * nx.number_of_nodes(g2) + e2[1])
  788. w_times[w_idx] = ek_temp
  789. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  790. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  791. e1[1] * nx.number_of_nodes(g2) + e2[0])
  792. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  793. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  794. # edge unlabeled
  795. else:
  796. for e1 in g1.edges(data=True):
  797. for e2 in g2.edges(data=True):
  798. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  799. e1[1] * nx.number_of_nodes(g2) + e2[1])
  800. w_times[w_idx] = 1
  801. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  802. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  803. e1[1] * nx.number_of_nodes(g2) + e2[0])
  804. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  805. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  806. return w_times, w_dim

A Python package for graph kernels, graph edit distances and graph pre-image problem.