  1. """
  2. @author: linlin
  3. @references:
  4. [1] Shervashidze N, Schweitzer P, Leeuwen EJ, Mehlhorn K, Borgwardt KM.
  5. Weisfeiler-lehman graph kernels. Journal of Machine Learning Research.
  6. 2011;12(Sep):2539-61.
  7. """
  8. import sys
  9. from collections import Counter
  10. from functools import partial
  11. import time
  12. #from multiprocessing import Pool
  13. from tqdm import tqdm
  14. import networkx as nx
  15. import numpy as np
  16. #from gklearn.kernels.pathKernel import pathkernel
  17. from gklearn.utils.graphdataset import get_dataset_attributes
  18. from gklearn.utils.parallel import parallel_gm
  19. # @todo: support edge kernel, sp kernel, user-defined kernel.
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           verbose=True):
    """Calculate Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.
    parallel : string or None
        Parallelization method applied to the computation: 'imap_unordered'
        (only supported by the 'subtree' base kernel) or None for serial
        computation.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied.
    verbose : bool
        Whether to print progress and timing information.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    run_time : float
        Time spent building the kernel matrix, in seconds.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # The default base kernel is the subtree kernel. For a user-defined
    # kernel, base_kernel is the base kernel function applied in each
    # iteration of the WL kernel; it must return a Numpy matrix, each element
    # of which is the user-defined base kernel between two graphs.

    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', node_label)

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, verbose)
    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    # for user-defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in %s seconds ---"
              % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
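
# A minimal usage sketch (Gn, G1 and G2 are placeholders for node-labeled
# NetworkX graphs supplied by the caller). The kernel matrix and the run time
# are returned together:
#
#     Kmatrix, run_time = weisfeilerlehmankernel(Gn, node_label='atom',
#                                                height=2, base_kernel='subtree')
#
# Passing exactly two graphs instead of a list yields a 2x2 matrix:
#
#     Kmatrix, run_time = weisfeilerlehmankernel(G1, G2, node_label='atom', height=2)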
def _wl_kernel_do(Gn, node_label, edge_label, height, parallel, n_jobs, verbose):
    """Calculate Weisfeiler-Lehman subtree kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        WL height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.
    """
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # initial for height = 0
    all_num_of_each_label = []  # number of occurrences of each label in each graph in this iteration
    # for each graph
    for G in Gn:
        # get the set of original labels
        labels_ori = list(nx.get_node_attributes(G, node_label).values())
        # number of occurrences of each label in G
        all_num_of_each_label.append(dict(Counter(labels_ori)))
    # calculate subtree kernel with the 0th iteration and add it to the final kernel
    compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        all_num_of_each_label = []  # number of occurrences of each label in G
        # @todo: parallel this part.
        for G in Gn:
            all_multisets = []
            for node, attrs in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
                # sorting each multiset
                multiset.sort()
                multiset = [attrs[node_label]] + multiset  # add the prefix
                all_multisets.append(tuple(multiset))

            # label compression
            set_unique = list(set(all_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes()):
                G.nodes[node][node_label] = set_compressed[all_multisets[idx]]

            # get the set of compressed labels
            labels_comp = list(nx.get_node_attributes(G, node_label).values())
            all_num_of_each_label.append(dict(Counter(labels_comp)))

        # calculate subtree kernel with h iterations and add it to the final kernel
        compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, False)

    return Kmatrix
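
# How compression stays consistent across graphs (a hypothetical trace):
# within one iteration, all_set_compressed is shared by every graph. If graph 1
# produces the multisets ('C', 'O') and ('O', 'C', 'C'), they may be compressed
# to '1' and '2'. When graph 2 later produces ('C', 'O') again, it reuses '1';
# a multiset not seen before, say ('O', 'O'), gets the next fresh label '3'.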
def wl_iteration(G, node_label):
    """Compute the sorted multiset label of every node of G for one WL iteration."""
    all_multisets = []
    for node, attrs in G.nodes(data=True):
        # Multiset-label determination.
        multiset = [G.nodes[neighbor][node_label] for neighbor in G[node]]
        # sorting each multiset
        multiset.sort()
        multiset = [attrs[node_label]] + multiset  # add the prefix
        all_multisets.append(tuple(multiset))
    return all_multisets
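
# A worked toy example (hypothetical labels): for a path graph a-b-c with node
# labels 'C', 'O', 'C' stored under node_label, wl_iteration returns one sorted
# multiset tuple per node, prefixed by the node's own label:
#
#     [('C', 'O'), ('O', 'C', 'C'), ('C', 'O')]
#
# Identical tuples then receive the same compressed label in _wl_kernel_do.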
def wrapper_wl_iteration(node_label, itr_item):
    g, i = itr_item
    all_multisets = wl_iteration(g, node_label)
    return i, all_multisets
def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
    """Compute the kernel matrix using the base kernel."""
    if parallel == 'imap_unordered':
        # compute kernels.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare
        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label,), n_jobs=n_jobs, verbose=verbose)
    elif parallel is None:
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] = compute_subtree_kernel(all_num_of_each_label[i],
                                                       all_num_of_each_label[j],
                                                       Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
def compute_subtree_kernel(num_of_each_label1, num_of_each_label2, kernel):
    """Compute the subtree kernel between two graphs from their per-label node counts."""
    labels = set(list(num_of_each_label1.keys()) + list(num_of_each_label2.keys()))
    vector1 = np.array([num_of_each_label1.get(label, 0) for label in labels])
    vector2 = np.array([num_of_each_label2.get(label, 0) for label in labels])
    kernel += np.dot(vector1, vector2)
    return kernel
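
# A worked toy example (hypothetical counts): with label counts
# {'1': 2, '2': 1} and {'1': 1, '2': 3}, the feature vectors over the label
# set {'1', '2'} are [2, 1] and [1, 3], so the kernel increment is
# 2 * 1 + 1 * 3 = 5.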
def wrapper_compute_subtree_kernel(Kmatrix, itr):
    i, j = itr
    return i, j, compute_subtree_kernel(G_alllabels[i], G_alllabels[j], Kmatrix[i][j])
def _wl_spkernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    from gklearn.utils.utils import getSPGraph

    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel
    Gn = [getSPGraph(G, edge_weight=edge_label) for G in Gn]  # get shortest path graphs of Gn

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and \
                       ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2]['cost'] != 0 and e1[2]['cost'] == e2[2]['cost'] and \
                           ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_edgekernel_do(Gn, node_label, edge_label, height):
    """Calculate Weisfeiler-Lehman edge kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    # init.
    height = int(height)
    Kmatrix = np.zeros((len(Gn), len(Gn)))  # init kernel

    # initial for height = 0
    for i in range(0, len(Gn)):
        for j in range(i, len(Gn)):
            for e1 in Gn[i].edges(data=True):
                for e2 in Gn[j].edges(data=True):
                    if e1[2][edge_label] == e2[2][edge_label] and \
                       ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                        Kmatrix[i][j] += 1
            Kmatrix[j][i] = Kmatrix[i][j]

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate subtree kernel with h iterations and add it to the final kernel
        for i in range(0, len(Gn)):
            for j in range(i, len(Gn)):
                for e1 in Gn[i].edges(data=True):
                    for e2 in Gn[j].edges(data=True):
                        if e1[2][edge_label] == e2[2][edge_label] and \
                           ((e1[0] == e2[0] and e1[1] == e2[1]) or (e1[0] == e2[1] and e1[1] == e2[0])):
                            Kmatrix[i][j] += 1
                Kmatrix[j][i] = Kmatrix[i][j]

    return Kmatrix
def _wl_userkernel_do(Gn, node_label, edge_label, height, base_kernel):
    """Calculate Weisfeiler-Lehman kernels based on a user-defined base kernel between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    height : int
        Subtree height.
    base_kernel : function
        Base kernel applied in each iteration of the WL kernel. It is called
        as base_kernel(Gn, node_label, edge_label) and must return a Numpy
        matrix, each element of which is the user-defined base kernel between
        two graphs.

    Returns
    -------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This base kernel is not supported by weisfeilerlehmankernel yet; see the
    @todo note at the top of this module.
    """
    # init.
    height = int(height)

    # initial for height = 0
    Kmatrix = base_kernel(Gn, node_label, edge_label)

    # iterate each height
    for h in range(1, height + 1):
        all_set_compressed = {}  # a dictionary mapping original labels to new ones in all graphs in this iteration
        num_of_labels_occurred = 0  # number of distinct labels that have occurred as node labels at least once in all graphs
        for G in Gn:  # for each graph
            set_multisets = []
            for node in G.nodes(data=True):
                # Multiset-label determination.
                multiset = [G.nodes[neighbor][node_label] for neighbor in G[node[0]]]
                # sorting each multiset
                multiset.sort()
                multiset = node[1][node_label] + ''.join(multiset)  # concatenate to a string and add the prefix
                set_multisets.append(multiset)

            # label compression
            set_unique = list(set(set_multisets))  # set of unique multiset labels
            # a dictionary mapping original labels to new ones.
            set_compressed = {}
            # if a label occurred before, assign its former compressed label;
            # else assign the number of labels occurred + 1 as the compressed label.
            for value in set_unique:
                if value in all_set_compressed:
                    set_compressed[value] = all_set_compressed[value]
                else:
                    set_compressed[value] = str(num_of_labels_occurred + 1)
                    num_of_labels_occurred += 1

            all_set_compressed.update(set_compressed)

            # relabel nodes
            for idx, node in enumerate(G.nodes(data=True)):
                node[1][node_label] = set_compressed[set_multisets[idx]]

        # calculate kernel with h iterations and add it to the final kernel
        Kmatrix += base_kernel(Gn, node_label, edge_label)

    return Kmatrix
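

# A minimal, self-contained demo (an illustrative sketch, not part of the
# library's test suite): build two small node-labeled graphs and compute their
# WL subtree kernel matrix with two refinement iterations.
if __name__ == '__main__':
    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
    G1.add_edges_from([(0, 1), (1, 2)])

    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}), (2, {'atom': 'O'})])
    G2.add_edges_from([(0, 1), (1, 2)])

    Kmatrix, run_time = weisfeilerlehmankernel(
        [G1, G2], node_label='atom', height=2, base_kernel='subtree')
    print(Kmatrix)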
