You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

iam.py 35 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri Apr 26 11:49:12 2019
  5. Iterative alternate minimizations using GED.
  6. @author: ljia
  7. """
  8. import numpy as np
  9. import random
  10. import networkx as nx
  11. from tqdm import tqdm
  12. import sys
  13. #from Cython_GedLib_2 import librariesImport, script
  14. import librariesImport, script
  15. sys.path.insert(0, "../")
  16. from pygraph.utils.graphfiles import saveDataset
  17. from pygraph.utils.graphdataset import get_dataset_attributes
  18. from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
  19. #from pygraph.utils.utils import graph_deepcopy
def iam_moreGraphsAsInit_tryAllPossibleBestGraphs(Gn_median, Gn_candidate,
        c_ei=3, c_er=3, c_es=1, ite_max=50, epsilon=0.001,
        node_label='atom', edge_label='bond_type',
        connected=False, removeNodes=True, AllBestInit=True,
        params_ged={'ged_cost': 'CHEM_1', 'ged_method': 'IPFP', 'saveGXL': 'benoit'}):
    """Approximate median graphs of ``Gn_median`` by iterative alternate
    minimization (IAM), starting from several candidate graphs and keeping
    every tied-best graph at each step.

    Parameters
    ----------
    Gn_median : list of networkx.Graph
        Graphs whose median is sought (sum of GEDs to these is minimized).
    Gn_candidate : list of networkx.Graph
        Candidate graphs used to initialize the iterations.
    c_ei, c_er, c_es : numbers
        Edit costs for edge insertion, removal and substitution.
    ite_max : int
        Maximum number of IAM iterations per initialization.
    epsilon : float
        Convergence threshold on the change of the sum of distances (SOD).
    node_label, edge_label : str
        Attribute names of the symbolic node / edge labels.
    connected : bool
        If True, prefer connected result graphs when any exist.
    removeNodes : bool
        If True, node removal is considered as a candidate "label" during
        the vertex-update step.
    AllBestInit : bool
        If True, iterate from every candidate; otherwise only from the
        candidates with minimal initial SOD.
    params_ged : dict
        Keyword arguments forwarded to ``median_distance`` (GED backend
        settings).  NOTE(review): mutable default argument — shared across
        calls; harmless only because it is never mutated here.

    Returns
    -------
    (list of networkx.Graph, float or list)
        One randomly chosen best median graph (in a 1-element list) and the
        corresponding minimal SOD.

    NOTE(review): ``node_ir``/``label_r`` are only bound when
    ``removeNodes`` is True, yet ``node_ir`` is read unconditionally in
    ``generate_graph`` for symbolic labels — calling with
    ``removeNodes=False`` would raise NameError there.  Confirm intended
    usage before relying on that flag.
    """
    from tqdm import tqdm
    if removeNodes:
        node_ir = np.inf  # pseudo node id meaning "removed/inserted" in a GED map.
        label_r = 'thanksdanny'  # the label for node remove. # @todo: make this label unrepeatable.
    ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
                                      attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
                                      edge_label=edge_label)

    def generate_graph(G, pi_p_forward, label_set):
        """One IAM update step on a copy of ``G``: re-label (or remove)
        vertices by majority vote over the forward maps, then update edge
        labels / adjacency; return all tied-best generated graphs, their
        forward maps and their distances to ``Gn_median``."""
        G_new_list = [G.copy()]  # all "best" graphs generated in this iteration.
        # --- update vertex labels: pre-compute h_i0 (votes) for each label.
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                h_i0_list = []
                label_list = []
                for label in label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # case when the node is to be removed.
                if removeNodes:
                    h_i0_remove = 0  # @todo: maybe this can be added to the label_set above.
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i == node_ir:
                            h_i0_remove += 1
                    h_i0_list.append(h_i0_remove)
                    label_list.append(label_r)
                # get the best labels (all maxima; ties are kept).
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                nlabel_best = [label_list[idx] for idx in idx_max]
                # generate "best" graphs with regard to "best" node labels.
                G_new_list_nd = []
                for g in G_new_list:  # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
                    for nl in nlabel_best:
                        g_tmp = g.copy()
                        if nl == label_r:
                            g_tmp.remove_node(nd)
                        else:
                            g_tmp.nodes[nd][node_label] = nl
                        G_new_list_nd.append(g_tmp)
                G_new_list = [ggg.copy() for ggg in G_new_list_nd]
        else:  # labels are non-symbolic: average the mapped attribute vectors.
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if g.has_node(pi_i):  # @todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new_list[0].nodes[nd]['attributes'] = phi_i_bar

        # --- update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            G_new_list_edge = []
            for g_new in G_new_list:
                nd_list = [n for n in g_new.nodes()]
                g_tmp_list = [g_new.copy()]
                for nd1i in range(nx.number_of_nodes(g_new)):
                    nd1 = nd_list[nd1i]  # @todo: not just edges, but all pairs of nodes
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
                        nd2 = nd_list[nd2i]
                        h_ij0_list = []
                        label_list = []
                        # @todo: compute edge label set before.
                        for label in get_edge_labels(Gn_median, edge_label):
                            h_ij0 = 0
                            for idx, g in enumerate(Gn_median):
                                pi_i = pi_p_forward[idx][nd1i]
                                pi_j = pi_p_forward[idx][nd2i]
                                h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                           g.has_edge(pi_i, pi_j) and
                                           g.edges[pi_i, pi_j][edge_label] == label)
                                h_ij0 += h_ij0_p
                            h_ij0_list.append(h_ij0)
                            label_list.append(label)
                        # get the best labels; choose all best graphs (ties kept).
                        idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                        elabel_best = [label_list[idx] for idx in idx_max]
                        h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
                        # generate "best" graphs with regard to "best" edge labels.
                        G_new_list_ed = []
                        for g_tmp in g_tmp_list:  # @todo: seems it can be simplified. The G_new_list will only contain 1 graph for now.
                            for idxl, el in enumerate(elabel_best):
                                g_tmp_copy = g_tmp.copy()
                                # check whether a_ij is 0 or 1 (keep or drop the edge).
                                sij_norm = 0
                                for idx, g in enumerate(Gn_median):
                                    pi_i = pi_p_forward[idx][nd1i]
                                    pi_j = pi_p_forward[idx][nd2i]
                                    if g.has_node(pi_i) and g.has_node(pi_j) and \
                                            g.has_edge(pi_i, pi_j):
                                        sij_norm += 1
                                if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
                                        sij_norm * (1 - (c_er + c_ei) / c_es):
                                    if not g_tmp_copy.has_edge(nd1, nd2):
                                        g_tmp_copy.add_edge(nd1, nd2)
                                    g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
                                else:
                                    if g_tmp_copy.has_edge(nd1, nd2):
                                        g_tmp_copy.remove_edge(nd1, nd2)
                                G_new_list_ed.append(g_tmp_copy)
                        g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
                G_new_list_edge += g_tmp_list
            G_new_list = [ggg.copy() for ggg in G_new_list_edge]
        else:  # if edges are unlabeled
            # @todo: is this even right? G or g_tmp? check if the new one is right
            # @todo: works only for undirected graphs.
            for g_tmp in G_new_list:
                nd_list = [n for n in g_tmp.nodes()]
                for nd1i in range(nx.number_of_nodes(g_tmp)):
                    nd1 = nd_list[nd1i]
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
                        nd2 = nd_list[nd2i]
                        sij_norm = 0
                        for idx, g in enumerate(Gn_median):
                            pi_i = pi_p_forward[idx][nd1i]
                            pi_j = pi_p_forward[idx][nd2i]
                            if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                                sij_norm += 1
                        if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
                            # @todo: should we consider if nd1 and nd2 in g_tmp?
                            # or just add the edge anyway?
                            if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
                                    and not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                        # else: # @todo: which to use?
                        elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
                            if g_tmp.has_edge(nd1, nd2):
                                g_tmp.remove_edge(nd1, nd2)
                        # do not change anything when equal.

        # find the best graphs generated in this iteration and update pi_p.
        # @todo: should we update all graphs generated or just the best ones?
        dis_list, pi_forward_list = median_distance(G_new_list, Gn_median,
                                                    **params_ged)
        # @todo: should we remove the identical and connectivity check?
        # Don't know which is faster.
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_new_list, idx_list = remove_duplicates(G_new_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
            dis_list = [dis_list[idx] for idx in idx_list]
        return G_new_list, pi_forward_list, dis_list

    def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
        """Return every graph in ``Gn_candidate`` with minimal distance,
        the matching forward maps, and that minimal distance."""
        idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
        dis_min = dis_all[idx_min_list[0]]
        pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
        G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
        return G_min_list, pi_forward_min_list, dis_min

    def iteration_proc(G, pi_p_forward, cur_sod):
        """Run the IAM loop from one initial graph until the SOD change
        drops below ``epsilon`` or ``ite_max`` iterations are reached."""
        G_list = [G]
        pi_forward_list = [pi_p_forward]
        old_sod = cur_sod * 2  # guarantees at least one iteration.
        sod_list = [cur_sod]
        dis_list = [cur_sod]
        itr = 0
        # @todo: what if difference == 0?
        while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
            print('itr_iam is', itr)
            G_new_list = []
            pi_forward_new_list = []
            dis_new_list = []
            for idx, g in enumerate(G_list):
                label_set = get_node_labels(Gn_median + [g], node_label)
                G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
                    g, pi_forward_list[idx], label_set)
                G_new_list += G_tmp_list
                pi_forward_new_list += pi_forward_tmp_list
                dis_new_list += dis_tmp_list
            # @todo: need to remove duplicates here?
            G_list = [ggg.copy() for ggg in G_new_list]
            pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
            dis_list = dis_new_list[:]
            old_sod = cur_sod
            cur_sod = np.min(dis_list)
            sod_list.append(cur_sod)
            itr += 1
        # @todo: do we return all graphs or the best ones?
        # get the best ones of the generated graphs.
        G_list, pi_forward_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_list, idx_list = remove_duplicates(G_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
        print('\nsods:', sod_list, '\n')
        return G_list, pi_forward_list, dis_min

    def remove_duplicates(Gn):
        """Remove duplicate graphs from list.

        Returns the deduplicated list and the indices of the kept graphs.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            dupl = False
            for g_new in Gn_new:
                if graph_isIdentical(g_new, g):
                    dupl = True
                    break
            if not dupl:
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    def remove_disconnected(Gn):
        """Remove disconnected graphs from list.

        Returns the connected graphs and the indices of the kept graphs.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            if nx.is_connected(g):
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    # phase 1: initilize. compute set-median.
    dis_min = np.inf
    dis_list, pi_forward_all = median_distance(Gn_candidate, Gn_median,
                                               **params_ged)
    # find all smallest distances.
    if AllBestInit:  # try all best init graphs.
        idx_min_list = range(len(dis_list))
        dis_min = dis_list
    else:
        idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
        dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)

    # phase 2: iteration.
    G_list = []
    dis_list = []
    pi_forward_list = []
    for idx_tmp, idx_min in enumerate(idx_min_list):
        G = Gn_candidate[idx_min].copy()
        # list of edit operations (forward node maps to each median graph).
        pi_p_forward = pi_forward_all[idx_min]
        Gi_list, pi_i_forward_list, dis_i_min = iteration_proc(G, pi_p_forward, dis_min[idx_tmp])
        G_list += Gi_list
        dis_list += [dis_i_min] * len(Gi_list)
        pi_forward_list += pi_i_forward_list

    if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
        G_list, idx_list = remove_duplicates(G_list)
        dis_list = [dis_list[idx] for idx in idx_list]
        pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
    if connected == True:
        G_list_con, idx_list = remove_disconnected(G_list)
        # if there is no connected graphs at all, then remain the disconnected ones.
        if len(G_list_con) > 0:  # @todo: ??????????????????????????
            G_list = G_list_con
            dis_list = [dis_list[idx] for idx in idx_list]
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

    # get the best median graphs.
    G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
        G_list, pi_forward_list, dis_list)
    # randomly choose one graph among the tied-best.
    idx_rdm = random.randint(0, len(G_min_list) - 1)
    G_min_list = [G_min_list[idx_rdm]]
    return G_min_list, dis_min
  383. ###############################################################################
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
        connected=True):
    """Compute one approximate median graph of ``Gn`` by IAM, starting from
    the set-median (the member of ``Gn`` with minimal sum of GEDs to all
    others) and iterating a fixed 10 rounds of vertex- and edge-updates.

    Parameters
    ----------
    Gn : list of networkx.Graph
        Graphs whose median is sought; relabeled to integer node ids first.
    c_ei, c_er, c_es : numbers
        Edit costs for edge insertion, removal and substitution.
    node_label, edge_label : str
        Attribute names of the symbolic node / edge labels.
    connected : bool
        Unused in this function body — kept for interface compatibility.
        NOTE(review): confirm whether connectivity filtering was intended.

    Returns
    -------
    networkx.Graph
        The graph after the final iteration.
    """
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    # phase 1: initilize. compute set-median.
    dis_min = np.inf
    pi_p = []
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations (forward maps from the set-median).
    pi_p = pi_all[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    for itr in range(0, 10):  # @todo: the convergence condition?
        G_new = G.copy()
        # --- update vertex labels.
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for nd, _ in G.nodes(data=True):
                h_i0_list = []
                label_list = []
                for label in get_node_labels(Gn, node_label):
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic: average mapped attribute vectors.
            for nd, _ in G.nodes(data=True):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i):  # @todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar

        # --- update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                # check whether a_ij is 0 or 1 (keep or drop the edge).
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            for nd1, nd2, _ in G.edges(data=True):
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        G = G_new.copy()

        # update pi_p: recompute forward maps from the new graph.
        pi_p = []
        for idx1, G_p in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G, G_p)
            pi_p.append(pi_tmp)

    return G
  502. def GED(g1, g2, lib='gedlib', cost='CHEM_1', method='IPFP', saveGXL='benoit',
  503. stabilizer='min'):
  504. """
  505. Compute GED.
  506. """
  507. if lib == 'gedlib':
  508. # transform dataset to the 'xml' file as the GedLib required.
  509. saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp',
  510. xparams={'method': saveGXL})
  511. # script.appel()
  512. script.PyRestartEnv()
  513. script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
  514. listID = script.PyGetGraphIds()
  515. script.PySetEditCost(cost) #("CHEM_1")
  516. script.PyInitEnv()
  517. script.PySetMethod(method, "")
  518. script.PyInitMethod()
  519. g = listID[0]
  520. h = listID[1]
  521. if stabilizer == None:
  522. script.PyRunMethod(g, h)
  523. pi_forward, pi_backward = script.PyGetAllMap(g, h)
  524. upper = script.PyGetUpperBound(g, h)
  525. lower = script.PyGetLowerBound(g, h)
  526. elif stabilizer == 'min':
  527. upper = np.inf
  528. for itr in range(50):
  529. script.PyRunMethod(g, h)
  530. upper_tmp = script.PyGetUpperBound(g, h)
  531. if upper_tmp < upper:
  532. upper = upper_tmp
  533. pi_forward, pi_backward = script.PyGetAllMap(g, h)
  534. lower = script.PyGetLowerBound(g, h)
  535. if upper == 0:
  536. break
  537. dis = upper
  538. # make the map label correct (label remove map as np.inf)
  539. nodes1 = [n for n in g1.nodes()]
  540. nodes2 = [n for n in g2.nodes()]
  541. nb1 = nx.number_of_nodes(g1)
  542. nb2 = nx.number_of_nodes(g2)
  543. pi_forward = [nodes2[pi] if pi < nb2 else np.inf for pi in pi_forward]
  544. pi_backward = [nodes1[pi] if pi < nb1 else np.inf for pi in pi_backward]
  545. return dis, pi_forward, pi_backward
  546. def median_distance(Gn, Gn_median, measure='ged', verbose=False,
  547. ged_cost='CHEM_1', ged_method='IPFP', saveGXL='benoit'):
  548. dis_list = []
  549. pi_forward_list = []
  550. for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
  551. file=sys.stdout) if verbose else enumerate(Gn):
  552. dis_sum = 0
  553. pi_forward_list.append([])
  554. for G_p in Gn_median:
  555. dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p,
  556. cost=ged_cost, method=ged_method, saveGXL=saveGXL)
  557. pi_forward_list[idx].append(pi_tmp_forward)
  558. dis_sum += dis_tmp
  559. dis_list.append(dis_sum)
  560. return dis_list, pi_forward_list
  561. # --------------------------- These are tests --------------------------------#
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
                                      node_label='atom', edge_label='bond_type'):
    """IAM variant initialized from the best of ``G_candidate`` (the
    candidate with minimal sum of GEDs to ``Gn``), then 10 fixed rounds of
    vertex- and edge-updates.

    Parameters
    ----------
    Gn : list of networkx.Graph
        Graphs whose median is sought; relabeled to integer node ids first.
    G_candidate : list of networkx.Graph
        Candidate graphs to initialize from.
    c_ei, c_er, c_es : numbers
        Edit costs for edge insertion, removal and substitution.
    node_label, edge_label : str
        Attribute names of the symbolic node / edge labels.

    Returns
    -------
    networkx.Graph
        The graph after the final iteration.
    """
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    # phase 1: initilize. compute set-median over the candidates.
    dis_min = np.inf
    pi_all_forward = []
    pi_all_backward = []
    for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
        dist_sum = 0
        pi_all_forward.append([])
        pi_all_backward.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
            pi_all_forward[idx1].append(pi_tmp_forward)
            pi_all_backward[idx1].append(pi_tmp_backward)
            dist_sum += dist_tmp
        if dist_sum <= dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations of the chosen candidate.
    pi_p_forward = pi_all_forward[idx_min]
    pi_p_backward = pi_all_backward[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    label_set = get_node_labels(Gn + [G], node_label)
    for itr in range(0, 10):  # @todo: the convergence condition?
        G_new = G.copy()
        # --- update vertex labels.
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for nd in G.nodes():
                h_i0_list = []
                label_list = []
                for label in label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic: average mapped attribute vectors.
            for nd in G.nodes():
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p_forward[idx][nd]
                    if g.has_node(pi_i):  # @todo: what if no g has node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar

        # --- update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd1]
                        pi_j = pi_p_forward[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                # check whether a_ij is 0 or 1 (keep or drop the edge).
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p_forward[idx][nd1]
                    pi_j = pi_p_forward[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            # @todo: works only for undirected graphs.
            for nd1 in range(nx.number_of_nodes(G)):
                for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
                    sij_norm = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd1]
                        pi_j = pi_p_forward[idx][nd2]
                        if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                            sij_norm += 1
                    if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                        if not G_new.has_edge(nd1, nd2):
                            G_new.add_edge(nd1, nd2)
                    elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
                        if G_new.has_edge(nd1, nd2):
                            G_new.remove_edge(nd1, nd2)
                    # do not change anything when equal.
        G = G_new.copy()

        # update pi_p: recompute forward maps from the new graph.
        pi_p_forward = []
        for G_p in Gn:
            dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
            pi_p_forward.append(pi_tmp_forward)

    return G
  688. ###############################################################################
if __name__ == '__main__':
    # Smoke-run IAM on the MUTAG dataset (symbolic node and edge labels).
    from pygraph.utils.graphfiles import loadDataset
    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
    # Alternative datasets (non-symbolic node attributes / unlabeled):
    # ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
    #       'extra_params': {}}  # node nsymb
    # ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
    #       'extra_params': {}}
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    iam(Gn)

A Python package for graph kernels, graph edit distances and the graph pre-image problem.