iam.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Apr 26 11:49:12 2019
Iterative alternate minimizations using GED.
@author: ljia
"""
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import sys
sys.path.insert(0, "../")
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical, get_node_labels, get_edge_labels
from ged import GED, ged_median


def iam_upgraded(Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, ite_max=50,
                 epsilon=0.001, node_label='atom', edge_label='bond_type',
                 connected=False, removeNodes=True, allBestInit=False,
                 allBestNodes=False, allBestEdges=False, allBestOutput=False,
                 params_ged={'lib': 'gedlibpy', 'cost': 'CHEM_1', 'method': 'IPFP',
                             'edit_cost_constant': [], 'stabilizer': None,
                             'algo_options': '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'}):
    """Compute a generalized median graph of Gn_median by iterative alternate
    minimization (IAM) using GED, starting from the best candidate(s) in
    Gn_candidate.
    """
#	Gn_median = Gn_median[0:10]
#	Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
    node_ir = np.inf  # sentinel assignment index for node removal/insertion.
    label_r = 'thanksdanny'  # label marking node removal. # @todo: make this label unrepeatable.
    ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
                                      attr_names=['edge_labeled', 'node_attr_dim', 'edge_attr_dim'],
                                      edge_label=edge_label)
    node_label_set = get_node_labels(Gn_median, node_label)
    edge_label_set = get_edge_labels(Gn_median, edge_label)

    def generate_graph(G, pi_p_forward):
        G_new_list = [G.copy()]  # all "best" graphs generated in this iteration.
#		nx.draw_networkx(G)
#		import matplotlib.pyplot as plt
#		plt.show()
#		print(pi_p_forward)

        # update vertex labels.
        # pre-compute h_i0 for each label.
#		for label in get_node_labels(Gn, node_label):
#			print(label)
#		for nd in G.nodes(data=True):
#			pass
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                h_i0_list = []
                label_list = []
                for label in node_label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i != node_ir and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # case when the node is to be removed.
                if removeNodes:
                    h_i0_remove = 0  # @todo: maybe this can be added to the node_label_set above.
                    for idx, g in enumerate(Gn_median):
                        pi_i = pi_p_forward[idx][ndi]
                        if pi_i == node_ir:
                            h_i0_remove += 1
                    h_i0_list.append(h_i0_remove)
                    label_list.append(label_r)
                # get the best labels.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                if allBestNodes:  # choose all best graphs.
                    nlabel_best = [label_list[idx] for idx in idx_max]
                    # generate "best" graphs with regard to "best" node labels.
                    G_new_list_nd = []
                    for g in G_new_list:  # @todo: seems this can be simplified; G_new_list contains only 1 graph for now.
                        for nl in nlabel_best:
                            g_tmp = g.copy()
                            if nl == label_r:
                                g_tmp.remove_node(nd)
                            else:
                                g_tmp.nodes[nd][node_label] = nl
                            G_new_list_nd.append(g_tmp)
#							nx.draw_networkx(g_tmp)
#							import matplotlib.pyplot as plt
#							plt.show()
#							print(g_tmp.nodes(data=True))
#							print(g_tmp.edges(data=True))
                    G_new_list = [ggg.copy() for ggg in G_new_list_nd]
                else:
                    # choose one of the best randomly.
                    idx_rdm = random.randint(0, len(idx_max) - 1)
                    best_label = label_list[idx_max[idx_rdm]]
                    h_i0_max = h_i0_list[idx_max[idx_rdm]]
                    g_new = G_new_list[0]
                    if best_label == label_r:
                        g_new.remove_node(nd)
                    else:
                        g_new.nodes[nd][node_label] = best_label
                    G_new_list = [g_new]
        else:  # labels are non-symbolic
            for ndi, (nd, _) in enumerate(G.nodes(data=True)):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn_median):
                    pi_i = pi_p_forward[idx][ndi]
                    if g.has_node(pi_i):  # @todo: what if no g has the node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new_list[0].nodes[nd]['attributes'] = phi_i_bar

#		for g in G_new_list:
#			import matplotlib.pyplot as plt
#			nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
#			plt.show()
#			print(g.nodes(data=True))
#			print(g.edges(data=True))

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            G_new_list_edge = []
            for g_new in G_new_list:
                nd_list = [n for n in g_new.nodes()]
                g_tmp_list = [g_new.copy()]
                for nd1i in range(nx.number_of_nodes(g_new)):
                    nd1 = nd_list[nd1i]  # @todo: not just edges, but all pairs of nodes
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_new)):
                        nd2 = nd_list[nd2i]
#			for nd1, nd2, _ in g_new.edges(data=True):
                        h_ij0_list = []
                        label_list = []
                        for label in edge_label_set:
                            h_ij0 = 0
                            for idx, g in enumerate(Gn_median):
                                pi_i = pi_p_forward[idx][nd1i]
                                pi_j = pi_p_forward[idx][nd2i]
                                h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                           g.has_edge(pi_i, pi_j) and
                                           g.edges[pi_i, pi_j][edge_label] == label)
                                h_ij0 += h_ij0_p
                            h_ij0_list.append(h_ij0)
                            label_list.append(label)
                        # get the best labels.
                        idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                        if allBestEdges:  # choose all best graphs.
                            elabel_best = [label_list[idx] for idx in idx_max]
                            h_ij0_max = [h_ij0_list[idx] for idx in idx_max]
                            # generate "best" graphs with regard to "best" edge labels.
                            G_new_list_ed = []
                            for g_tmp in g_tmp_list:  # @todo: seems this can be simplified; g_tmp_list contains only 1 graph for now.
                                for idxl, el in enumerate(elabel_best):
                                    g_tmp_copy = g_tmp.copy()
                                    # check whether a_ij is 0 or 1.
                                    sij_norm = 0
                                    for idx, g in enumerate(Gn_median):
                                        pi_i = pi_p_forward[idx][nd1i]
                                        pi_j = pi_p_forward[idx][nd2i]
                                        if g.has_node(pi_i) and g.has_node(pi_j) and \
                                                g.has_edge(pi_i, pi_j):
                                            sij_norm += 1
                                    if h_ij0_max[idxl] > len(Gn_median) * c_er / c_es + \
                                            sij_norm * (1 - (c_er + c_ei) / c_es):
                                        if not g_tmp_copy.has_edge(nd1, nd2):
                                            g_tmp_copy.add_edge(nd1, nd2)
                                        g_tmp_copy.edges[nd1, nd2][edge_label] = elabel_best[idxl]
                                    else:
                                        if g_tmp_copy.has_edge(nd1, nd2):
                                            g_tmp_copy.remove_edge(nd1, nd2)
                                    G_new_list_ed.append(g_tmp_copy)
                            g_tmp_list = [ggg.copy() for ggg in G_new_list_ed]
                        else:  # choose one of the best randomly.
                            idx_rdm = random.randint(0, len(idx_max) - 1)
                            best_label = label_list[idx_max[idx_rdm]]
                            h_ij0_max = h_ij0_list[idx_max[idx_rdm]]
                            # check whether a_ij is 0 or 1.
                            sij_norm = 0
                            for idx, g in enumerate(Gn_median):
                                pi_i = pi_p_forward[idx][nd1i]
                                pi_j = pi_p_forward[idx][nd2i]
                                if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                                    sij_norm += 1
                            if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                                if not g_new.has_edge(nd1, nd2):
                                    g_new.add_edge(nd1, nd2)
                                g_new.edges[nd1, nd2][edge_label] = best_label
                            else:
#								elif h_ij0_max < len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                                if g_new.has_edge(nd1, nd2):
                                    g_new.remove_edge(nd1, nd2)
                            g_tmp_list = [g_new]
                G_new_list_edge += g_tmp_list
            G_new_list = [ggg.copy() for ggg in G_new_list_edge]
        else:  # if edges are unlabeled
            # @todo: is this even right? G or g_tmp? check if the new one is right
            # @todo: works only for undirected graphs.
            for g_tmp in G_new_list:
                nd_list = [n for n in g_tmp.nodes()]
                for nd1i in range(nx.number_of_nodes(g_tmp)):
                    nd1 = nd_list[nd1i]
                    for nd2i in range(nd1i + 1, nx.number_of_nodes(g_tmp)):
                        nd2 = nd_list[nd2i]
                        sij_norm = 0
                        for idx, g in enumerate(Gn_median):
                            pi_i = pi_p_forward[idx][nd1i]
                            pi_j = pi_p_forward[idx][nd2i]
                            if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                                sij_norm += 1
                        if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
                            # @todo: should we check whether nd1 and nd2 are in g_tmp,
                            # or just add the edge anyway?
                            if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
                                    and not g_tmp.has_edge(nd1, nd2):
                                g_tmp.add_edge(nd1, nd2)
                        else:  # @todo: which to use?
#						elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
                            if g_tmp.has_edge(nd1, nd2):
                                g_tmp.remove_edge(nd1, nd2)
                        # do not change anything when equal.

#		for i, g in enumerate(G_new_list):
#			import matplotlib.pyplot as plt
#			nx.draw(g, labels=nx.get_node_attributes(g, 'atom'), with_labels=True)
##			plt.savefig("results/gk_iam/simple_two/xx" + str(i) + ".png", format="PNG")
#			plt.show()
#			print(g.nodes(data=True))
#			print(g.edges(data=True))

#		# find the best graph generated in this iteration and update pi_p.
        # @todo: should we update all generated graphs or just the best ones?
        dis_list, pi_forward_list = ged_median(G_new_list, Gn_median,
                                               params_ged=params_ged)
        # @todo: should we remove the identical and connectivity checks?
        # Don't know which is faster.
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_new_list, idx_list = remove_duplicates(G_new_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
            dis_list = [dis_list[idx] for idx in idx_list]
#		if connected == True:
#			G_new_list, idx_list = remove_disconnected(G_new_list)
#			pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#		idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
#		dis_min = dis_list[idx_min_tmp_list[0]]
#		pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
#		G_new_list = [G_new_list[idx] for idx in idx_min_list]

#		for g in G_new_list:
#			import matplotlib.pyplot as plt
#			nx.draw_networkx(g)
#			plt.show()
#			print(g.nodes(data=True))
#			print(g.edges(data=True))

        return G_new_list, pi_forward_list, dis_list
    def best_median_graphs(Gn_candidate, pi_all_forward, dis_all):
        idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
        dis_min = dis_all[idx_min_list[0]]
        pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
        G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
        return G_min_list, pi_forward_min_list, dis_min

    def iteration_proc(G, pi_p_forward, cur_sod):
        G_list = [G]
        pi_forward_list = [pi_p_forward]
        old_sod = cur_sod * 2
        sod_list = [cur_sod]
        dis_list = [cur_sod]
        # iterations.
        itr = 0
        # @todo: what if difference == 0?
#		while itr < ite_max and (np.abs(old_sod - cur_sod) > epsilon or
#								 np.abs(old_sod - cur_sod) == 0):
        while itr < ite_max and np.abs(old_sod - cur_sod) > epsilon:
#		while itr < ite_max:
#		for itr in range(0, 5):  # the convergence condition?
            print('itr_iam is', itr)
            G_new_list = []
            pi_forward_new_list = []
            dis_new_list = []
            for idx, g in enumerate(G_list):
#				label_set = get_node_labels(Gn_median + [g], node_label)
                G_tmp_list, pi_forward_tmp_list, dis_tmp_list = generate_graph(
                    g, pi_forward_list[idx])
                G_new_list += G_tmp_list
                pi_forward_new_list += pi_forward_tmp_list
                dis_new_list += dis_tmp_list
            # @todo: need to remove duplicates here?
            G_list = [ggg.copy() for ggg in G_new_list]
            pi_forward_list = [pitem.copy() for pitem in pi_forward_new_list]
            dis_list = dis_new_list[:]
            old_sod = cur_sod
            cur_sod = np.min(dis_list)
            sod_list.append(cur_sod)
            itr += 1

        # @todo: do we return all graphs or the best ones?
        # get the best ones of the generated graphs.
        G_list, pi_forward_list, dis_min = best_median_graphs(
            G_list, pi_forward_list, dis_list)
        if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
            G_list, idx_list = remove_duplicates(G_list)
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
#			dis_list = [dis_list[idx] for idx in idx_list]
#		import matplotlib.pyplot as plt
#		for g in G_list:
#			nx.draw_networkx(g)
#			plt.show()
#			print(g.nodes(data=True))
#			print(g.edges(data=True))
        print('\nsods:', sod_list, '\n')
        return G_list, pi_forward_list, dis_min, sod_list

    def remove_duplicates(Gn):
        """Remove duplicate graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            dupl = False
            for g_new in Gn_new:
                if graph_isIdentical(g_new, g):
                    dupl = True
                    break
            if not dupl:
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list

    def remove_disconnected(Gn):
        """Remove disconnected graphs from list.
        """
        Gn_new = []
        idx_list = []
        for idx, g in enumerate(Gn):
            if nx.is_connected(g):
                Gn_new.append(g)
                idx_list.append(idx)
        return Gn_new, idx_list
    ###########################################################################
    # phase 1: initialize.
    # compute the set-median.
    dis_min = np.inf
    dis_list, pi_forward_all = ged_median(Gn_candidate, Gn_median,
                                          params_ged=params_ged, parallel=True)
    print('finish computing GEDs.')
    # find all smallest distances.
    if allBestInit:  # try all best init graphs.
        idx_min_list = range(len(dis_list))
        dis_min = dis_list
    else:
        idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
        dis_min = [dis_list[idx_min_list[0]]] * len(idx_min_list)
        idx_min_rdm = random.randint(0, len(idx_min_list) - 1)
        idx_min_list = [idx_min_list[idx_min_rdm]]
    sod_set_median = np.min(dis_min)

    # phase 2: iteration.
    G_list = []
    dis_list = []
    pi_forward_list = []
    G_set_median_list = []
#	sod_list = []
    for idx_tmp, idx_min in enumerate(idx_min_list):
#		print('idx_min is', idx_min)
        G = Gn_candidate[idx_min].copy()
        G_set_median_list.append(G.copy())
        # list of edit operations.
        pi_p_forward = pi_forward_all[idx_min]
#		pi_p_backward = pi_all_backward[idx_min]
        Gi_list, pi_i_forward_list, dis_i_min, sod_list = iteration_proc(
            G, pi_p_forward, dis_min[idx_tmp])
        G_list += Gi_list
        dis_list += [dis_i_min] * len(Gi_list)
        pi_forward_list += pi_i_forward_list

    if ds_attrs['node_attr_dim'] == 0 and ds_attrs['edge_attr_dim'] == 0:
        G_list, idx_list = remove_duplicates(G_list)
        dis_list = [dis_list[idx] for idx in idx_list]
        pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
    if connected:
        G_list_con, idx_list = remove_disconnected(G_list)
        # if there are no connected graphs at all, keep the disconnected ones.
        if len(G_list_con) > 0:  # @todo: check this fallback.
            G_list = G_list_con
            dis_list = [dis_list[idx] for idx in idx_list]
            pi_forward_list = [pi_forward_list[idx] for idx in idx_list]

#	import matplotlib.pyplot as plt
#	for g in G_list:
#		nx.draw_networkx(g)
#		plt.show()
#		print(g.nodes(data=True))
#		print(g.edges(data=True))

    # get the best median graphs.
    G_gen_median_list, pi_forward_min_list, sod_gen_median = best_median_graphs(
        G_list, pi_forward_list, dis_list)
#	for g in G_gen_median_list:
#		nx.draw_networkx(g)
#		plt.show()
#		print(g.nodes(data=True))
#		print(g.edges(data=True))

    if not allBestOutput:
        # randomly choose one graph.
        idx_rdm = random.randint(0, len(G_gen_median_list) - 1)
        G_gen_median_list = [G_gen_median_list[idx_rdm]]

    return G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, sod_set_median
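
# The following small sketch is an editorial addition, not part of the original
# algorithm: it evaluates the edge decision rule used in generate_graph() above
# on hand-picked toy numbers, so the threshold
#     h_ij0_max > n * c_er / c_es + s_ij * (1 - (c_er + c_ei) / c_es)
# can be checked by hand. The function name and the toy values are made up for
# illustration only; it is never called by the code in this file.
def _edge_rule_sketch(c_ei=3, c_er=3, c_es=1):
    n = 10      # number of median graphs, i.e. len(Gn_median).
    s_ij = 7    # graphs in which the mapped node pair (pi_i, pi_j) is an edge.
    h_ij0 = 6   # graphs in which that edge also carries the candidate label.
    threshold = n * c_er / c_es + s_ij * (1 - (c_er + c_ei) / c_es)
    # With the default costs: threshold = 10 * 3 + 7 * (1 - 6) = 30 - 35 = -5,
    # so h_ij0 = 6 > -5 and the edge is kept (and relabeled); with s_ij = 4 the
    # threshold rises to 30 - 20 = 10 and the edge would be removed.
    return h_ij0 > threshold, threshold
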
def iam_bash(Gn_names, edit_cost_constant, cost='CONSTANT',
             dataset='monoterpenoides',
             graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/'):
    """Compute the IAM by the C++ implementation (GEDLIB) through bash.
    """
    import os
    import time

    def createCollectionFile(Gn_names, y, filename):
        """Create collection file.
        """
        dirname_ds = os.path.dirname(filename)
        if dirname_ds != '':
            dirname_ds += '/'
            if not os.path.exists(dirname_ds):
                os.makedirs(dirname_ds)
        with open(filename + '.xml', 'w') as fgroup:
            fgroup.write("<?xml version=\"1.0\"?>")
            fgroup.write("\n<!DOCTYPE GraphCollection SYSTEM \"http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd\">")
            fgroup.write("\n<GraphCollection>")
            for idx, fname in enumerate(Gn_names):
                fgroup.write("\n\t<graph file=\"" + fname + "\" class=\"" + str(y[idx]) + "\"/>")
            fgroup.write("\n</GraphCollection>")

    tmp_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/'
    fn_collection = tmp_dir + 'collection.' + str(time.time()) + str(random.randint(0, int(1e9)))
    createCollectionFile(Gn_names, ['dummy'] * len(Gn_names), fn_collection)
#	graph_dir = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/gxl'

    command = 'GEDLIB_HOME=\'/media/ljia/DATA/research-repo/codes/Linlin/gedlib\'\n'
    command += 'LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$GEDLIB_HOME/lib\n'
    command += 'export LD_LIBRARY_PATH\n'
    command += 'cd \'/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/bin\'\n'
    command += './iam_for_python_bash ' + dataset + ' ' + fn_collection \
        + ' \'' + graph_dir + '\' ' + ' ' + cost + ' '
    if edit_cost_constant is None:
        command += 'None'
    else:
        for ec in edit_cost_constant:
            command += str(ec) + ' '
#	output = os.system(command)
    stream = os.popen(command)
    output = stream.readlines()
#	print(output)

    sod_sm = float(output[0].strip())
    sod_gm = float(output[1].strip())
    fname_sm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/set_median.gxl'
    fname_gm = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/output/tmp_ged/gen_median.gxl'
    return sod_sm, sod_gm, fname_sm, fname_gm
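
# For reference (editorial note, derived directly from the writes above): the
# collection file produced by createCollectionFile() has the following shape,
# here with two made-up graph file names and the 'dummy' class that iam_bash
# passes in:
#
#	<?xml version="1.0"?>
#	<!DOCTYPE GraphCollection SYSTEM "http://www.inf.unibz.it/~blumenthal/dtd/GraphCollection.dtd">
#	<GraphCollection>
#		<graph file="molecule_1.gxl" class="dummy"/>
#		<graph file="molecule_2.gxl" class="dummy"/>
#	</GraphCollection>
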
###############################################################################
# Old implementations.

def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
        connected=True):
    """Old implementation: compute a median graph of Gn by iterative alternate
    minimization using GED.
    """
#	Gn = Gn[0:10]
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    # phase 1: initialize.
    # compute the set-median.
    dis_min = np.inf
    pi_p = []
    pi_all = []
    for idx1, G_p in enumerate(Gn):
        dist_sum = 0
        pi_all.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
            pi_all[idx1].append(pi_tmp)
            dist_sum += dist_tmp
        if dist_sum < dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.
    pi_p = pi_all[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    for itr in range(0, 10):  # @todo: the convergence condition?
        G_new = G.copy()
        # update vertex labels.
        # pre-compute h_i0 for each label.
#		for label in get_node_labels(Gn, node_label):
#			print(label)
#		for nd in G.nodes(data=True):
#			pass
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for nd, _ in G.nodes(data=True):
                h_i0_list = []
                label_list = []
                for label in get_node_labels(Gn, node_label):
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic
            for nd, _ in G.nodes(data=True):
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd]
                    if g.has_node(pi_i):  # @todo: what if no g has the node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p[idx][nd1]
                        pi_j = pi_p[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                # check whether a_ij is 0 or 1.
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            for nd1, nd2, _ in G.edges(data=True):
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p[idx][nd1]
                    pi_j = pi_p[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        G = G_new.copy()

        # update pi_p.
        pi_p = []
        for idx1, G_p in enumerate(Gn):
            dist_tmp, pi_tmp, _ = GED(G, G_p)
            pi_p.append(pi_tmp)

    return G
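
# A minimal sketch (an editorial addition, never called by the code in this
# file) of the majority vote that both iam() above and iam_upgraded() apply to
# symbolic node labels: count how often each label occurs among the nodes one
# median-graph node is mapped to across Gn, then keep the label(s) with maximal
# count. The helper name and the toy labels are made up for illustration.
def _node_label_vote_sketch():
    from collections import Counter
    mapped_labels = ['C', 'C', 'N', 'C', 'O']  # labels of the mapped nodes in Gn.
    h_i0 = Counter(mapped_labels)              # the h_i0 count per label.
    h_max = max(h_i0.values())
    best_labels = [lb for lb, h in h_i0.items() if h == h_max]
    return best_labels  # ['C'] here; ties are broken randomly in iam().
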
# --------------------------- These are tests ---------------------------------#
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
                                      node_label='atom', edge_label='bond_type'):
    """Old test implementation: compute a median graph of Gn, initialized from
    the best graph among G_candidate.
    """
#	Gn = Gn[0:10]
    Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
    # phase 1: initialize.
    # compute the set-median.
    dis_min = np.inf
#	pi_p = []
    pi_all_forward = []
    pi_all_backward = []
    for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
        dist_sum = 0
        pi_all_forward.append([])
        pi_all_backward.append([])
        for idx2, G_p_prime in enumerate(Gn):
            dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
            pi_all_forward[idx1].append(pi_tmp_forward)
            pi_all_backward[idx1].append(pi_tmp_backward)
            dist_sum += dist_tmp
        if dist_sum <= dis_min:
            dis_min = dist_sum
            G = G_p.copy()
            idx_min = idx1
    # list of edit operations.
    pi_p_forward = pi_all_forward[idx_min]
    pi_p_backward = pi_all_backward[idx_min]

    # phase 2: iteration.
    ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
                                      edge_label=edge_label)
    label_set = get_node_labels(Gn + [G], node_label)
    for itr in range(0, 10):  # @todo: the convergence condition?
        G_new = G.copy()
        # update vertex labels.
        # pre-compute h_i0 for each label.
#		for label in get_node_labels(Gn, node_label):
#			print(label)
#		for nd in G.nodes(data=True):
#			pass
        if not ds_attrs['node_attr_dim']:  # labels are symbolic
            for nd in G.nodes():
                h_i0_list = []
                label_list = []
                for label in label_set:
                    h_i0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd]
                        if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
                            h_i0 += 1
                    h_i0_list.append(h_i0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
                idx_rdm = random.randint(0, len(idx_max) - 1)
                G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
        else:  # labels are non-symbolic
            for nd in G.nodes():
                Si_norm = 0
                phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
                for idx, g in enumerate(Gn):
                    pi_i = pi_p_forward[idx][nd]
                    if g.has_node(pi_i):  # @todo: what if no g has the node? phi_i_bar = 0?
                        Si_norm += 1
                        phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
                phi_i_bar /= Si_norm
                G_new.nodes[nd]['attributes'] = phi_i_bar

        # update edge labels and adjacency matrix.
        if ds_attrs['edge_labeled']:
            for nd1, nd2, _ in G.edges(data=True):
                h_ij0_list = []
                label_list = []
                for label in get_edge_labels(Gn, edge_label):
                    h_ij0 = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd1]
                        pi_j = pi_p_forward[idx][nd2]
                        h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
                                   g.has_edge(pi_i, pi_j) and
                                   g.edges[pi_i, pi_j][edge_label] == label)
                        h_ij0 += h_ij0_p
                    h_ij0_list.append(h_ij0)
                    label_list.append(label)
                # choose one of the best randomly.
                idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
                h_ij0_max = h_ij0_list[idx_max[0]]
                idx_rdm = random.randint(0, len(idx_max) - 1)
                best_label = label_list[idx_max[idx_rdm]]
                # check whether a_ij is 0 or 1.
                sij_norm = 0
                for idx, g in enumerate(Gn):
                    pi_i = pi_p_forward[idx][nd1]
                    pi_j = pi_p_forward[idx][nd2]
                    if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                        sij_norm += 1
                if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
                    if not G_new.has_edge(nd1, nd2):
                        G_new.add_edge(nd1, nd2)
                    G_new.edges[nd1, nd2][edge_label] = best_label
                else:
                    if G_new.has_edge(nd1, nd2):
                        G_new.remove_edge(nd1, nd2)
        else:  # if edges are unlabeled
            # @todo: works only for undirected graphs.
            for nd1 in range(nx.number_of_nodes(G)):
                for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
                    sij_norm = 0
                    for idx, g in enumerate(Gn):
                        pi_i = pi_p_forward[idx][nd1]
                        pi_j = pi_p_forward[idx][nd2]
                        if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
                            sij_norm += 1
                    if sij_norm > len(Gn) * c_er / (c_er + c_ei):
                        if not G_new.has_edge(nd1, nd2):
                            G_new.add_edge(nd1, nd2)
                    elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
                        if G_new.has_edge(nd1, nd2):
                            G_new.remove_edge(nd1, nd2)
                    # do not change anything when equal.
        G = G_new.copy()

        # update pi_p.
        pi_p_forward = []
        for G_p in Gn:
            dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
            pi_p_forward.append(pi_tmp_forward)

    return G
###############################################################################
if __name__ == '__main__':
    from pygraph.utils.graphfiles import loadDataset

    ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
          'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}  # node/edge symb
#	ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
#		  'extra_params': {}}  # node nsymb
#	ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
#		  'extra_params': {}}
    Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
    iam(Gn)
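
#	A possible usage sketch for iam_upgraded (an editorial addition, left
#	commented out: it requires a compiled gedlibpy/GEDLIB backend, and using
#	the first ten graphs as both the median set and the candidate set is only
#	an assumption for illustration):
#	Gn_median = Gn[0:10]
#	G_gen_median_list, sod_gen_median, sod_list, G_set_median_list, \
#		sod_set_median = iam_upgraded(Gn_median, Gn_median)
#	print('SOD of the set-median:', sod_set_median)
#	print('SOD of the generalized median:', sod_gen_median)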
