You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

randomWalkKernel.py 39 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845
  1. """
  2. @author: linlin
  3. @references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research, 11(Apr):1201–1242, 2010.
  4. """
  5. import sys
  6. sys.path.insert(0, "../")
  7. import time
  8. from functools import partial
  9. from tqdm import tqdm
  10. import networkx as nx
  11. import numpy as np
  12. from scipy.sparse import identity, kron
  13. from scipy.sparse.linalg import cg
  14. from scipy.optimize import fixed_point
  15. from pygraph.utils.graphdataset import get_dataset_attributes
  16. from pygraph.utils.parallel import parallel_gm
  17. def randomwalkkernel(*args,
  18. # params for all method.
  19. compute_method=None,
  20. weight=1,
  21. p=None,
  22. q=None,
  23. edge_weight=None,
  24. # params for conjugate and fp method.
  25. node_kernels=None,
  26. edge_kernels=None,
  27. node_label='atom',
  28. edge_label='bond_type',
  29. # params for spectral method.
  30. sub_kernel=None,
  31. n_jobs=None,
  32. verbose=True):
  33. """Calculate random walk graph kernels.
  34. Parameters
  35. ----------
  36. Gn : List of NetworkX graph
  37. List of graphs between which the kernels are calculated.
  38. /
  39. G1, G2 : NetworkX graphs
  40. 2 graphs between which the kernel is calculated.
  41. node_label : string
  42. node attribute used as label. The default node label is atom.
  43. edge_label : string
  44. edge attribute used as label. The default edge label is bond_type.
  45. h : integer
  46. Longest length of walks.
  47. method : string
  48. Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.
  49. Return
  50. ------
  51. Kmatrix : Numpy matrix
  52. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  53. """
  54. compute_method = compute_method.lower()
  55. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  56. Gn = [g.copy() for g in Gn]
  57. eweight = None
  58. if edge_weight == None:
  59. if verbose:
  60. print('\n None edge weight specified. Set all weight to 1.\n')
  61. else:
  62. try:
  63. some_weight = list(
  64. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  65. if isinstance(some_weight, float) or isinstance(some_weight, int):
  66. eweight = edge_weight
  67. else:
  68. if verbose:
  69. print('\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  70. % edge_weight)
  71. except:
  72. if verbose:
  73. print('\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  74. % edge_weight)
  75. ds_attrs = get_dataset_attributes(
  76. Gn,
  77. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  78. 'edge_attr_dim', 'is_directed'],
  79. node_label=node_label,
  80. edge_label=edge_label)
  81. # remove graphs with no edges, as no walk can be found in their structures,
  82. # so the weight matrix between such a graph and itself might be zero.
  83. len_gn = len(Gn)
  84. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  85. idx = [G[0] for G in Gn]
  86. Gn = [G[1] for G in Gn]
  87. if len(Gn) != len_gn:
  88. if verbose:
  89. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  90. (len_gn - len(Gn)))
  91. start_time = time.time()
  92. # # get vertex and edge concatenated labels for each graph
  93. # label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
  94. # gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
  95. if compute_method == 'sylvester':
  96. if verbose:
  97. import warnings
  98. warnings.warn('All labels are ignored.')
  99. Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs, verbose=verbose)
  100. elif compute_method == 'conjugate':
  101. Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs, node_kernels,
  102. edge_kernels, node_label, edge_label,
  103. eweight, n_jobs, verbose=verbose)
  104. elif compute_method == 'fp':
  105. Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
  106. edge_kernels, node_label, edge_label,
  107. eweight, n_jobs, verbose=verbose)
  108. elif compute_method == 'spectral':
  109. if verbose:
  110. import warnings
  111. warnings.warn('All labels are ignored. Only works for undirected graphs.')
  112. Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
  113. eweight, n_jobs, verbose=verbose)
  114. elif compute_method == 'kron':
  115. pass
  116. for i in range(0, len(Gn)):
  117. for j in range(i, len(Gn)):
  118. Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
  119. node_label, edge_label)
  120. Kmatrix[j][i] = Kmatrix[i][j]
  121. else:
  122. raise Exception(
  123. 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
  124. )
  125. run_time = time.time() - start_time
  126. if verbose:
  127. print("\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
  128. % (len(Gn), run_time))
  129. return Kmatrix, run_time, idx
  130. ###############################################################################
  131. def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
  132. """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.
  133. Parameters
  134. ----------
  135. G1, G2 : NetworkX graph
  136. Graphs between which the kernel is calculated.
  137. node_label : string
  138. node attribute used as label.
  139. edge_label : string
  140. edge attribute used as label.
  141. Return
  142. ------
  143. kernel : float
  144. Kernel between 2 graphs.
  145. """
  146. Kmatrix = np.zeros((len(Gn), len(Gn)))
  147. if q == None:
  148. # don't normalize adjacency matrices if q is a uniform vector. Note
  149. # A_wave_list accually contains the transposes of the adjacency matrices.
  150. A_wave_list = [
  151. nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  152. (tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout) if
  153. verbose else Gn)
  154. ]
  155. # # normalized adjacency matrices
  156. # A_wave_list = []
  157. # for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
  158. # A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
  159. # norm = A_tilde.sum(axis=0)
  160. # norm[norm == 0] = 1
  161. # A_wave_list.append(A_tilde / norm)
  162. if p == None: # p is uniform distribution as default.
  163. def init_worker(Awl_toshare):
  164. global G_Awl
  165. G_Awl = Awl_toshare
  166. do_partial = partial(wrapper_se_do, lmda)
  167. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  168. glbv=(A_wave_list,), n_jobs=n_jobs, verbose=verbose)
  169. # pbar = tqdm(
  170. # total=(1 + len(Gn)) * len(Gn) / 2,
  171. # desc='calculating kernels',
  172. # file=sys.stdout)
  173. # for i in range(0, len(Gn)):
  174. # for j in range(i, len(Gn)):
  175. # S = lmda * A_wave_list[j]
  176. # T_t = A_wave_list[i]
  177. # # use uniform distribution if there is no prior knowledge.
  178. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  179. # p_times_uni = 1 / nb_pd
  180. # M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
  181. # X = dlyap(S, T_t, M0)
  182. # X = np.reshape(X, (-1, 1), order='F')
  183. # # use uniform distribution if there is no prior knowledge.
  184. # q_times = np.full((1, nb_pd), p_times_uni)
  185. # Kmatrix[i][j] = np.dot(q_times, X)
  186. # Kmatrix[j][i] = Kmatrix[i][j]
  187. # pbar.update(1)
  188. return Kmatrix
  189. def wrapper_se_do(lmda, itr):
  190. i = itr[0]
  191. j = itr[1]
  192. return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
  193. def _se_do(A_wave1, A_wave2, lmda):
  194. from control import dlyap
  195. S = lmda * A_wave2
  196. T_t = A_wave1
  197. # use uniform distribution if there is no prior knowledge.
  198. nb_pd = len(A_wave1) * len(A_wave2)
  199. p_times_uni = 1 / nb_pd
  200. M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
  201. X = dlyap(S, T_t, M0)
  202. X = np.reshape(X, (-1, 1), order='F')
  203. # use uniform distribution if there is no prior knowledge.
  204. q_times = np.full((1, nb_pd), p_times_uni)
  205. return np.dot(q_times, X)
  206. ###############################################################################
  207. def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  208. node_label, edge_label, eweight, n_jobs, verbose=True):
  209. """Calculate walk graph kernels up to n between 2 graphs using conjugate method.
  210. Parameters
  211. ----------
  212. G1, G2 : NetworkX graph
  213. Graphs between which the kernel is calculated.
  214. node_label : string
  215. node attribute used as label.
  216. edge_label : string
  217. edge attribute used as label.
  218. Return
  219. ------
  220. kernel : float
  221. Kernel between 2 graphs.
  222. """
  223. Kmatrix = np.zeros((len(Gn), len(Gn)))
  224. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  225. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
  226. # # this is faster from unlabeled graphs. @todo: why?
  227. # if q == None:
  228. # # don't normalize adjacency matrices if q is a uniform vector. Note
  229. # # A_wave_list accually contains the transposes of the adjacency matrices.
  230. # A_wave_list = [
  231. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  232. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  233. # ]
  234. # if p == None: # p is uniform distribution as default.
  235. # def init_worker(Awl_toshare):
  236. # global G_Awl
  237. # G_Awl = Awl_toshare
  238. # do_partial = partial(wrapper_cg_unlabled_do, lmda)
  239. # parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  240. # glbv=(A_wave_list,), n_jobs=n_jobs)
  241. # else:
  242. # reindex nodes using consecutive integers for convenience of kernel calculation.
  243. Gn = [nx.convert_node_labels_to_integers(
  244. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  245. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  246. if p == None and q == None: # p and q are uniform distributions as default.
  247. def init_worker(gn_toshare):
  248. global G_gn
  249. G_gn = gn_toshare
  250. do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
  251. node_label, edge_kernels, edge_label, lmda)
  252. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  253. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  254. # pbar = tqdm(
  255. # total=(1 + len(Gn)) * len(Gn) / 2,
  256. # desc='calculating kernels',
  257. # file=sys.stdout)
  258. # for i in range(0, len(Gn)):
  259. # for j in range(i, len(Gn)):
  260. # result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
  261. # node_label, edge_kernels, edge_label, lmda)
  262. # Kmatrix[i][j] = result
  263. # Kmatrix[j][i] = Kmatrix[i][j]
  264. # pbar.update(1)
  265. return Kmatrix
  266. def wrapper_cg_unlabled_do(lmda, itr):
  267. i = itr[0]
  268. j = itr[1]
  269. return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
  270. def _cg_unlabled_do(A_wave1, A_wave2, lmda):
  271. nb_pd = len(A_wave1) * len(A_wave2)
  272. p_times_uni = 1 / nb_pd
  273. w_times = kron(A_wave1, A_wave2).todense()
  274. A = identity(w_times.shape[0]) - w_times * lmda
  275. b = np.full((nb_pd, 1), p_times_uni)
  276. x, _ = cg(A, b)
  277. # use uniform distribution if there is no prior knowledge.
  278. q_times = np.full((1, nb_pd), p_times_uni)
  279. return np.dot(q_times, x)
  280. def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  281. edge_label, lmda, itr):
  282. i = itr[0]
  283. j = itr[1]
  284. return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  285. node_label, edge_kernels, edge_label, lmda)
  286. def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  287. edge_kernels, edge_label, lmda):
  288. # Frist, compute kernels between all pairs of nodes, method borrowed
  289. # from FCSP. It is faster than directly computing all edge kernels
  290. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  291. # graphs compared, which is the most case we went though. For very
  292. # sparse graphs, this would be slow.
  293. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  294. # Compute weight matrix of the direct product graph.
  295. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  296. edge_kernels, edge_label)
  297. # use uniform distribution if there is no prior knowledge.
  298. p_times_uni = 1 / w_dim
  299. A = identity(w_times.shape[0]) - w_times * lmda
  300. b = np.full((w_dim, 1), p_times_uni)
  301. x, _ = cg(A, b)
  302. # use uniform distribution if there is no prior knowledge.
  303. q_times = np.full((1, w_dim), p_times_uni)
  304. return np.dot(q_times, x)
  305. ###############################################################################
  306. def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
  307. node_label, edge_label, eweight, n_jobs, verbose=True):
  308. """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.
  309. Parameters
  310. ----------
  311. G1, G2 : NetworkX graph
  312. Graphs between which the kernel is calculated.
  313. node_label : string
  314. node attribute used as label.
  315. edge_label : string
  316. edge attribute used as label.
  317. Return
  318. ------
  319. kernel : float
  320. Kernel between 2 graphs.
  321. """
  322. Kmatrix = np.zeros((len(Gn), len(Gn)))
  323. # if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
  324. # not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
  325. # # this is faster from unlabeled graphs. @todo: why?
  326. # if q == None:
  327. # # don't normalize adjacency matrices if q is a uniform vector. Note
  328. # # A_wave_list accually contains the transposes of the adjacency matrices.
  329. # A_wave_list = [
  330. # nx.adjacency_matrix(G, eweight).todense().transpose() for G in
  331. # tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
  332. # ]
  333. # if p == None: # p is uniform distribution as default.
  334. # pbar = tqdm(
  335. # total=(1 + len(Gn)) * len(Gn) / 2,
  336. # desc='calculating kernels',
  337. # file=sys.stdout)
  338. # for i in range(0, len(Gn)):
  339. # for j in range(i, len(Gn)):
  340. # # use uniform distribution if there is no prior knowledge.
  341. # nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
  342. # p_times_uni = 1 / nb_pd
  343. # w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
  344. # p_times = np.full((nb_pd, 1), p_times_uni)
  345. # x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
  346. # # use uniform distribution if there is no prior knowledge.
  347. # q_times = np.full((1, nb_pd), p_times_uni)
  348. # Kmatrix[i][j] = np.dot(q_times, x)
  349. # Kmatrix[j][i] = Kmatrix[i][j]
  350. # pbar.update(1)
  351. # else:
  352. # reindex nodes using consecutive integers for convenience of kernel calculation.
  353. Gn = [nx.convert_node_labels_to_integers(
  354. g, first_label=0, label_attribute='label_orignal') for g in (tqdm(
  355. Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)]
  356. if p == None and q == None: # p and q are uniform distributions as default.
  357. def init_worker(gn_toshare):
  358. global G_gn
  359. G_gn = gn_toshare
  360. do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
  361. node_label, edge_kernels, edge_label, lmda)
  362. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  363. glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)
  364. return Kmatrix
  365. def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  366. edge_label, lmda, itr):
  367. i = itr[0]
  368. j = itr[1]
  369. return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  370. node_label, edge_kernels, edge_label, lmda)
  371. def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  372. edge_kernels, edge_label, lmda):
  373. # Frist, compute kernels between all pairs of nodes, method borrowed
  374. # from FCSP. It is faster than directly computing all edge kernels
  375. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  376. # graphs compared, which is the most case we went though. For very
  377. # sparse graphs, this would be slow.
  378. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  379. # Compute weight matrix of the direct product graph.
  380. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  381. edge_kernels, edge_label)
  382. # use uniform distribution if there is no prior knowledge.
  383. p_times_uni = 1 / w_dim
  384. p_times = np.full((w_dim, 1), p_times_uni)
  385. x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
  386. xtol=1e-06, maxiter=1000)
  387. # use uniform distribution if there is no prior knowledge.
  388. q_times = np.full((1, w_dim), p_times_uni)
  389. return np.dot(q_times, x)
  390. def func_fp(x, p_times, lmda, w_times):
  391. haha = w_times * x
  392. haha = lmda * haha
  393. haha = p_times + haha
  394. return p_times + lmda * np.dot(w_times, x)
  395. ###############################################################################
  396. def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):
  397. """Calculate walk graph kernels up to n between 2 unlabeled graphs using
  398. spectral decomposition method. Labels will be ignored.
  399. Parameters
  400. ----------
  401. G1, G2 : NetworkX graph
  402. Graphs between which the kernel is calculated.
  403. node_label : string
  404. node attribute used as label.
  405. edge_label : string
  406. edge attribute used as label.
  407. Return
  408. ------
  409. kernel : float
  410. Kernel between 2 graphs.
  411. """
  412. Kmatrix = np.zeros((len(Gn), len(Gn)))
  413. if q == None:
  414. # precompute the spectral decomposition of each graph.
  415. P_list = []
  416. D_list = []
  417. for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout) if
  418. verbose else Gn):
  419. # don't normalize adjacency matrices if q is a uniform vector. Note
  420. # A accually is the transpose of the adjacency matrix.
  421. A = nx.adjacency_matrix(G, eweight).todense().transpose()
  422. ew, ev = np.linalg.eig(A)
  423. D_list.append(ew)
  424. P_list.append(ev)
  425. # P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?
  426. if p == None: # p is uniform distribution as default.
  427. q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
  428. # q_T_list = [q.T for q in q_list]
  429. def init_worker(q_T_toshare, P_toshare, D_toshare):
  430. global G_q_T, G_P, G_D
  431. G_q_T = q_T_toshare
  432. G_P = P_toshare
  433. G_D = D_toshare
  434. do_partial = partial(wrapper_sd_do, weight, sub_kernel)
  435. parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
  436. glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
  437. verbose=verbose)
  438. # pbar = tqdm(
  439. # total=(1 + len(Gn)) * len(Gn) / 2,
  440. # desc='calculating kernels',
  441. # file=sys.stdout)
  442. # for i in range(0, len(Gn)):
  443. # for j in range(i, len(Gn)):
  444. # result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
  445. # D_list[i], D_list[j], weight, sub_kernel)
  446. # Kmatrix[i][j] = result
  447. # Kmatrix[j][i] = Kmatrix[i][j]
  448. # pbar.update(1)
  449. return Kmatrix
  450. def wrapper_sd_do(weight, sub_kernel, itr):
  451. i = itr[0]
  452. j = itr[1]
  453. return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
  454. weight, sub_kernel)
  455. def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
  456. # use uniform distribution if there is no prior knowledge.
  457. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
  458. # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs
  459. # kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
  460. if sub_kernel == 'exp':
  461. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  462. kmiddle = np.diag(np.exp(weight * D_diag))
  463. elif sub_kernel == 'geo':
  464. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  465. kmiddle = np.diag(weight * D_diag)
  466. kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
  467. kmiddle = np.linalg.inv(kmiddle)
  468. return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
  469. ###############################################################################
  470. def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
  471. """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.
  472. Parameters
  473. ----------
  474. G1, G2 : NetworkX graph
  475. Graphs between which the kernel is calculated.
  476. node_label : string
  477. node attribute used as label.
  478. edge_label : string
  479. edge attribute used as label.
  480. Return
  481. ------
  482. kernel : float
  483. Kernel between 2 graphs.
  484. """
  485. pass
  486. ###############################################################################
  487. def getLabels(Gn, node_label, edge_label, directed):
  488. """Get symbolic labels of a graph dataset, where vertex labels are dealt
  489. with by concatenating them to the edge labels of adjacent edges.
  490. """
  491. label_list = []
  492. label_set = set()
  493. for g in Gn:
  494. label_g = {}
  495. for e in g.edges(data=True):
  496. nl1 = g.node[e[0]][node_label]
  497. nl2 = g.node[e[1]][node_label]
  498. if not directed and nl1 > nl2:
  499. nl1, nl2 = nl2, nl1
  500. label = (nl1, e[2][edge_label], nl2)
  501. label_g[(e[0], e[1])] = label
  502. label_list.append(label_g)
  503. label_set = set([l for lg in label_list for l in lg.values()])
  504. return label_list, len(label_set)
  505. def filterGramMatrix(gmt, label_dict, label, directed):
  506. """Compute (the transpose of) the Gram matrix filtered by a label.
  507. """
  508. gmf = np.zeros(gmt.shape)
  509. for (n1, n2), l in label_dict.items():
  510. if l == label:
  511. gmf[n2, n1] = gmt[n2, n1]
  512. if not directed:
  513. gmf[n1, n2] = gmt[n1, n2]
  514. return gmf
  515. def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
  516. '''Compute vertex kernels between vertices of two graphs.
  517. '''
  518. vk_dict = {} # shortest path matrices dict
  519. if ds_attrs['node_labeled']:
  520. # node symb and non-synb labeled
  521. if ds_attrs['node_attr_dim'] > 0:
  522. kn = node_kernels['mix']
  523. for n1 in g1.nodes(data=True):
  524. for n2 in g2.nodes(data=True):
  525. vk_dict[(n1[0], n2[0])] = kn(
  526. n1[1][node_label], n2[1][node_label],
  527. n1[1]['attributes'], n2[1]['attributes'])
  528. # node symb labeled
  529. else:
  530. kn = node_kernels['symb']
  531. for n1 in g1.nodes(data=True):
  532. for n2 in g2.nodes(data=True):
  533. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  534. n2[1][node_label])
  535. else:
  536. # node non-synb labeled
  537. if ds_attrs['node_attr_dim'] > 0:
  538. kn = node_kernels['nsymb']
  539. for n1 in g1.nodes(data=True):
  540. for n2 in g2.nodes(data=True):
  541. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  542. n2[1]['attributes'])
  543. # node unlabeled
  544. else:
  545. pass
  546. return vk_dict
def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
    '''Compute weight matrix of the direct product graph.

    A vertex of the direct product graph is a pair (v1, v2), v1 in g1 and
    v2 in g2, flattened to the index v1 * |V(g2)| + v2 (nodes are assumed
    to be consecutive integers, see the reindexing in the callers). Each
    entry combines the vertex kernels of the two endpoint pairs (when
    nodes are labeled) with the edge kernel of the corresponding edge
    pair (when edges are labeled); unlabeled cases degrade to plain
    products or the constant 1.

    Parameters
    ----------
    g1, g2 : NetworkX graph
        Graphs compared.
    vk_dict : dict
        Vertex kernels as returned by computeVK; empty when nodes are
        unlabeled.
    ds_attrs : dict
        Dataset attributes ('is_directed', 'edge_labeled',
        'edge_attr_dim').
    edge_kernels : dict
        Edge kernel functions ('mix', 'symb', 'nsymb').
    edge_label : string
        Edge attribute used as label.

    Return
    ------
    w_times : numpy array
        Weight matrix of the direct product graph.
    w_dim : int
        Dimension of w_times (number of vertex pairs).
    '''
    w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
    w_times = np.zeros((w_dim, w_dim))
    if vk_dict:  # node labeled
        if ds_attrs['is_directed']:
            if ds_attrs['edge_labeled']:
                # edge symb and non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['mix']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                                         e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])]
                # edge symb labeled
                else:
                    ke = edge_kernels['symb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])]
            else:
                # edge non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['nsymb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])]
                # edge unlabeled
                else:
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * vk_dict[(e1[1], e2[1])]
        else:  # undirected
            # for undirected graphs each pair of edges contributes in both
            # orientations: both orientations are summed into the entry,
            # and the four symmetric positions (w_idx, w_idx2 and their
            # transposes) receive the same value.
            if ds_attrs['edge_labeled']:
                # edge symb and non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['mix']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                                         e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])] \
                                + vk_dict[(e1[0], e2[1])] \
                                * ek_temp * vk_dict[(e1[1], e2[0])]
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
                # edge symb labeled
                else:
                    ke = edge_kernels['symb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])] \
                                + vk_dict[(e1[0], e2[1])] \
                                * ek_temp * vk_dict[(e1[1], e2[0])]
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
            else:
                # edge non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['nsymb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * ek_temp * vk_dict[(e1[1], e2[1])] \
                                + vk_dict[(e1[0], e2[1])] \
                                * ek_temp * vk_dict[(e1[1], e2[0])]
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
                # edge unlabeled
                else:
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
                                * vk_dict[(e1[1], e2[1])] \
                                + vk_dict[(e1[0], e2[1])] \
                                * vk_dict[(e1[1], e2[0])]
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
    else:  # node unlabeled
        if ds_attrs['is_directed']:
            if ds_attrs['edge_labeled']:
                # edge symb and non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['mix']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                                         e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
                # edge symb labeled
                else:
                    ke = edge_kernels['symb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
            else:
                # edge non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['nsymb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
                # edge unlabeled
                else:
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = 1
        else:  # undirected
            if ds_attrs['edge_labeled']:
                # edge symb and non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['mix']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
                                         e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
                # edge symb labeled
                else:
                    ke = edge_kernels['symb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
            else:
                # edge non-synb labeled
                if ds_attrs['edge_attr_dim'] > 0:
                    ke = edge_kernels['nsymb']
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = ek_temp
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
                # edge unlabeled
                else:
                    for e1 in g1.edges(data=True):
                        for e2 in g2.edges(data=True):
                            w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
                                     e1[1] * nx.number_of_nodes(g2) + e2[1])
                            w_times[w_idx] = 1
                            w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
                            w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
                                      e1[1] * nx.number_of_nodes(g2) + e2[0])
                            w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
                            w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
    return w_times, w_dim

A Python package for graph kernels, graph edit distances and graph pre-image problem.