You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

graphdataset.py 8.4 kB


  1. """ Obtain all kinds of attributes of a graph dataset.
  2. """
  3. def get_dataset_attributes(Gn,
  4. target=None,
  5. attr_names=[],
  6. node_label=None,
  7. edge_label=None):
  8. import networkx as nx
  9. import numpy as np
  10. attrs = {}
  11. def get_dataset_size(Gn):
  12. return len(Gn)
  13. def get_all_node_num(Gn):
  14. return [nx.number_of_nodes(G) for G in Gn]
  15. def get_ave_node_num(all_node_num):
  16. return np.mean(all_node_num)
  17. def get_min_node_num(all_node_num):
  18. return np.amin(all_node_num)
  19. def get_max_node_num(all_node_num):
  20. return np.amax(all_node_num)
  21. def get_all_edge_num(Gn):
  22. return [nx.number_of_edges(G) for G in Gn]
  23. def get_ave_edge_num(all_edge_num):
  24. return np.mean(all_edge_num)
  25. def get_min_edge_num(all_edge_num):
  26. return np.amin(all_edge_num)
  27. def get_max_edge_num(all_edge_num):
  28. return np.amax(all_edge_num)
  29. def is_node_labeled(Gn):
  30. return False if node_label is None else True
  31. def get_node_label_num(Gn):
  32. nl = set()
  33. for G in Gn:
  34. nl = nl | set(nx.get_node_attributes(G, node_label).values())
  35. return len(nl)
  36. def is_edge_labeled(Gn):
  37. return False if edge_label is None else True
  38. def get_edge_label_num(Gn):
  39. el = set()
  40. for G in Gn:
  41. el = el | set(nx.get_edge_attributes(G, edge_label).values())
  42. return len(el)
  43. def is_directed(Gn):
  44. return nx.is_directed(Gn[0])
  45. def get_ave_node_degree(Gn):
  46. return np.mean([np.mean(list(dict(G.degree()).values())) for G in Gn])
  47. def get_max_node_degree(Gn):
  48. return np.amax([np.mean(list(dict(G.degree()).values())) for G in Gn])
  49. def get_min_node_degree(Gn):
  50. return np.amin([np.mean(list(dict(G.degree()).values())) for G in Gn])
  51. # get fill factor, the number of non-zero entries in the adjacency matrix.
  52. def get_ave_fill_factor(Gn):
  53. return np.mean([nx.number_of_edges(G) / (nx.number_of_nodes(G)
  54. * nx.number_of_nodes(G)) for G in Gn])
  55. def get_max_fill_factor(Gn):
  56. return np.amax([nx.number_of_edges(G) / (nx.number_of_nodes(G)
  57. * nx.number_of_nodes(G)) for G in Gn])
  58. def get_min_fill_factor(Gn):
  59. return np.amin([nx.number_of_edges(G) / (nx.number_of_nodes(G)
  60. * nx.number_of_nodes(G)) for G in Gn])
  61. def get_substructures(Gn):
  62. subs = set()
  63. for G in Gn:
  64. degrees = list(dict(G.degree()).values())
  65. if any(i == 2 for i in degrees):
  66. subs.add('linear')
  67. if np.amax(degrees) >= 3:
  68. subs.add('non linear')
  69. if 'linear' in subs and 'non linear' in subs:
  70. break
  71. if is_directed(Gn):
  72. for G in Gn:
  73. if len(list(nx.find_cycle(G))) > 0:
  74. subs.add('cyclic')
  75. break
  76. # else:
  77. # # @todo: this method does not work for big graph with large amount of edges like D&D, try a better way.
  78. # upper = np.amin([nx.number_of_edges(G) for G in Gn]) * 2 + 10
  79. # for G in Gn:
  80. # if (nx.number_of_edges(G) < upper):
  81. # cyc = list(nx.simple_cycles(G.to_directed()))
  82. # if any(len(i) > 2 for i in cyc):
  83. # subs.add('cyclic')
  84. # break
  85. # if 'cyclic' not in subs:
  86. # for G in Gn:
  87. # cyc = list(nx.simple_cycles(G.to_directed()))
  88. # if any(len(i) > 2 for i in cyc):
  89. # subs.add('cyclic')
  90. # break
  91. return subs
  92. def get_class_num(target):
  93. return len(set(target))
  94. def get_node_attr_dim(Gn):
  95. for G in Gn:
  96. for n in G.nodes(data=True):
  97. if 'attributes' in n[1]:
  98. return len(n[1]['attributes'])
  99. return 0
  100. def get_edge_attr_dim(Gn):
  101. for G in Gn:
  102. if nx.number_of_edges(G) > 0:
  103. for e in G.edges(data=True):
  104. if 'attributes' in e[2]:
  105. return len(e[2]['attributes'])
  106. return 0
  107. if attr_names == []:
  108. attr_names = [
  109. 'substructures',
  110. 'node_labeled',
  111. 'edge_labeled',
  112. 'is_directed',
  113. 'dataset_size',
  114. 'ave_node_num',
  115. 'min_node_num',
  116. 'max_node_num',
  117. 'ave_edge_num',
  118. 'min_edge_num',
  119. 'max_edge_num',
  120. 'ave_node_degree',
  121. 'min_node_degree',
  122. 'max_node_degree',
  123. 'ave_fill_factor',
  124. 'min_fill_factor',
  125. 'max_fill_factor',
  126. 'node_label_num',
  127. 'edge_label_num',
  128. 'node_attr_dim',
  129. 'edge_attr_dim',
  130. 'class_number',
  131. ]
  132. # dataset size
  133. if 'dataset_size' in attr_names:
  134. attrs.update({'dataset_size': get_dataset_size(Gn)})
  135. # graph node number
  136. if any(i in attr_names
  137. for i in ['ave_node_num', 'min_node_num', 'max_node_num']):
  138. all_node_num = get_all_node_num(Gn)
  139. if 'ave_node_num' in attr_names:
  140. attrs.update({'ave_node_num': get_ave_node_num(all_node_num)})
  141. if 'min_node_num' in attr_names:
  142. attrs.update({'min_node_num': get_min_node_num(all_node_num)})
  143. if 'max_node_num' in attr_names:
  144. attrs.update({'max_node_num': get_max_node_num(all_node_num)})
  145. # graph edge number
  146. if any(i in attr_names for i in
  147. ['ave_edge_num', 'min_edge_num', 'max_edge_num']):
  148. all_edge_num = get_all_edge_num(Gn)
  149. if 'ave_edge_num' in attr_names:
  150. attrs.update({'ave_edge_num': get_ave_edge_num(all_edge_num)})
  151. if 'max_edge_num' in attr_names:
  152. attrs.update({'max_edge_num': get_max_edge_num(all_edge_num)})
  153. if 'min_edge_num' in attr_names:
  154. attrs.update({'min_edge_num': get_min_edge_num(all_edge_num)})
  155. # label number
  156. if any(i in attr_names for i in ['node_labeled', 'node_label_num']):
  157. is_nl = is_node_labeled(Gn)
  158. node_label_num = get_node_label_num(Gn)
  159. if 'node_labeled' in attr_names:
  160. # graphs are considered node unlabeled if all nodes have the same label.
  161. attrs.update({'node_labeled': is_nl if node_label_num > 1 else False})
  162. if 'node_label_num' in attr_names:
  163. attrs.update({'node_label_num': node_label_num})
  164. if any(i in attr_names for i in ['edge_labeled', 'edge_label_num']):
  165. is_el = is_edge_labeled(Gn)
  166. edge_label_num = get_edge_label_num(Gn)
  167. if 'edge_labeled' in attr_names:
  168. # graphs are considered edge unlabeled if all edges have the same label.
  169. attrs.update({'edge_labeled': is_el if edge_label_num > 1 else False})
  170. if 'edge_label_num' in attr_names:
  171. attrs.update({'edge_label_num': edge_label_num})
  172. if 'is_directed' in attr_names:
  173. attrs.update({'is_directed': is_directed(Gn)})
  174. if 'ave_node_degree' in attr_names:
  175. attrs.update({'ave_node_degree': get_ave_node_degree(Gn)})
  176. if 'max_node_degree' in attr_names:
  177. attrs.update({'max_node_degree': get_max_node_degree(Gn)})
  178. if 'min_node_degree' in attr_names:
  179. attrs.update({'min_node_degree': get_min_node_degree(Gn)})
  180. if 'ave_fill_factor' in attr_names:
  181. attrs.update({'ave_fill_factor': get_ave_fill_factor(Gn)})
  182. if 'max_fill_factor' in attr_names:
  183. attrs.update({'max_fill_factor': get_max_fill_factor(Gn)})
  184. if 'min_fill_factor' in attr_names:
  185. attrs.update({'min_fill_factor': get_min_fill_factor(Gn)})
  186. if 'substructures' in attr_names:
  187. attrs.update({'substructures': get_substructures(Gn)})
  188. if 'class_number' in attr_names:
  189. attrs.update({'class_number': get_class_num(target)})
  190. if 'node_attr_dim' in attr_names:
  191. attrs['node_attr_dim'] = get_node_attr_dim(Gn)
  192. if 'edge_attr_dim' in attr_names:
  193. attrs['edge_attr_dim'] = get_edge_attr_dim(Gn)
  194. from collections import OrderedDict
  195. return OrderedDict(
  196. sorted(attrs.items(), key=lambda i: attr_names.index(i[0])))

A Python package for graph kernels, graph edit distances and graph pre-image problem.