You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

get_dataset_attributes.ipynb 23 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": 1,
  6. "metadata": {
  7. "scrolled": false
  8. },
  9. "outputs": [
  10. {
  11. "name": "stdout",
  12. "output_type": "stream",
  13. "text": [
  14. "\n",
  15. "Acyclic:\n",
  16. "substructures : {'non linear', 'linear'}\n",
  17. "node_labeled : True\n",
  18. "edge_labeled : False\n",
  19. "is_directed : False\n",
  20. "dataset_size : 183\n",
  21. "ave_node_num : 8.153005464480874\n",
  22. "min_node_num : 3\n",
  23. "max_node_num : 11\n",
  24. "ave_edge_num : 7.1530054644808745\n",
  25. "min_edge_num : 2\n",
  26. "max_edge_num : 10\n",
  27. "ave_node_degree : 1.737561012151176\n",
  28. "min_node_degree : 1.3333333333333333\n",
  29. "max_node_degree : 1.8181818181818181\n",
  30. "ave_fill_factor : 0.11241161841596808\n",
  31. "min_fill_factor : 0.08264462809917356\n",
  32. "max_fill_factor : 0.2222222222222222\n",
  33. "node_label_num : 3\n",
  34. "edge_label_num : 1\n",
  35. "node_attr_dim : 0\n",
  36. "edge_attr_dim : 0\n",
  37. "class_number : 148\n",
  38. "\n",
  39. "\n",
  40. "Alkane:\n",
  41. "substructures : {'non linear', 'linear'}\n",
  42. "node_labeled : True\n",
  43. "edge_labeled : False\n",
  44. "is_directed : False\n",
  45. "dataset_size : 150\n",
  46. "ave_node_num : 8.873333333333333\n",
  47. "min_node_num : 1\n",
  48. "max_node_num : 10\n",
  49. "ave_edge_num : 7.873333333333333\n",
  50. "min_edge_num : 0\n",
  51. "max_edge_num : 9\n",
  52. "ave_node_degree : 1.7507830687830694\n",
  53. "min_node_degree : 0.0\n",
  54. "max_node_degree : 1.8\n",
  55. "ave_fill_factor : 0.10199498404299989\n",
  56. "min_fill_factor : 0.0\n",
  57. "max_fill_factor : 0.25\n",
  58. "node_label_num : 2\n",
  59. "edge_label_num : 1\n",
  60. "node_attr_dim : 0\n",
  61. "edge_attr_dim : 0\n",
  62. "class_number : 123\n",
  63. "\n",
  64. "\n",
  65. "MAO:\n",
  66. "substructures : {'non linear', 'linear'}\n",
  67. "node_labeled : True\n",
  68. "edge_labeled : True\n",
  69. "is_directed : False\n",
  70. "dataset_size : 68\n",
  71. "ave_node_num : 18.38235294117647\n",
  72. "min_node_num : 11\n",
  73. "max_node_num : 27\n",
  74. "ave_edge_num : 19.63235294117647\n",
  75. "min_edge_num : 12\n",
  76. "max_edge_num : 29\n",
  77. "ave_node_degree : 2.1347114940751464\n",
  78. "min_node_degree : 2.090909090909091\n",
  79. "max_node_degree : 2.2\n",
  80. "ave_fill_factor : 0.060638921710159575\n",
  81. "min_fill_factor : 0.039780521262002745\n",
  82. "max_fill_factor : 0.09917355371900827\n",
  83. "node_label_num : 3\n",
  84. "edge_label_num : 4\n",
  85. "node_attr_dim : 0\n",
  86. "edge_attr_dim : 0\n",
  87. "class_number : 2\n",
  88. "\n",
  89. "\n",
  90. "PAH:\n",
  91. "substructures : {'non linear', 'linear'}\n",
  92. "node_labeled : False\n",
  93. "edge_labeled : False\n",
  94. "is_directed : False\n",
  95. "dataset_size : 94\n",
  96. "ave_node_num : 20.70212765957447\n",
  97. "min_node_num : 10\n",
  98. "max_node_num : 28\n",
  99. "ave_edge_num : 24.425531914893618\n",
  100. "min_edge_num : 11\n",
  101. "max_edge_num : 34\n",
  102. "ave_node_degree : 2.3550919704450077\n",
  103. "min_node_degree : 2.2\n",
  104. "max_node_degree : 2.5\n",
  105. "ave_fill_factor : 0.05799294134806485\n",
  106. "min_fill_factor : 0.04336734693877551\n",
  107. "max_fill_factor : 0.11\n",
  108. "node_label_num : 1\n",
  109. "edge_label_num : 1\n",
  110. "node_attr_dim : 0\n",
  111. "edge_attr_dim : 0\n",
  112. "class_number : 2\n",
  113. "\n",
  114. "\n",
  115. "MUTAG:\n",
  116. "substructures : {'non linear', 'linear'}\n",
  117. "node_labeled : True\n",
  118. "edge_labeled : True\n",
  119. "is_directed : False\n",
  120. "dataset_size : 188\n",
  121. "ave_node_num : 17.930851063829788\n",
  122. "min_node_num : 10\n",
  123. "max_node_num : 28\n",
  124. "ave_edge_num : 19.79255319148936\n",
  125. "min_edge_num : 10\n",
  126. "max_edge_num : 33\n",
  127. "ave_node_degree : 2.1887720785524962\n",
  128. "min_node_degree : 2.0\n",
  129. "max_node_degree : 2.4444444444444446\n",
  130. "ave_fill_factor : 0.06480462822996713\n",
  131. "min_fill_factor : 0.039540816326530615\n",
  132. "max_fill_factor : 0.1\n",
  133. "node_label_num : 7\n",
  134. "edge_label_num : 11\n",
  135. "node_attr_dim : 0\n",
  136. "edge_attr_dim : 0\n",
  137. "class_number : 2\n",
  138. "\n",
  139. "\n",
  140. "Letter-med:\n",
  141. "substructures : {'non linear', 'linear'}\n",
  142. "node_labeled : False\n",
  143. "edge_labeled : False\n",
  144. "is_directed : False\n",
  145. "dataset_size : 2250\n",
  146. "ave_node_num : 4.674666666666667\n",
  147. "min_node_num : 1\n",
  148. "max_node_num : 9\n",
  149. "ave_edge_num : 3.2057777777777776\n",
  150. "min_edge_num : 0\n",
  151. "max_edge_num : 7\n",
  152. "ave_node_degree : 1.35270582010582\n",
  153. "min_node_degree : 0.0\n",
  154. "max_node_degree : 2.4\n",
  155. "ave_fill_factor : 0.15517701625094482\n",
  156. "min_fill_factor : 0.0\n",
  157. "max_fill_factor : 0.3333333333333333\n",
  158. "node_label_num : 0\n",
  159. "edge_label_num : 0\n",
  160. "node_attr_dim : 2\n",
  161. "edge_attr_dim : 0\n",
  162. "class_number : 15\n",
  163. "\n",
  164. "\n",
  165. "ENZYMES:\n",
  166. "substructures : {'non linear', 'linear'}\n",
  167. "node_labeled : True\n",
  168. "edge_labeled : False\n",
  169. "is_directed : False\n",
  170. "dataset_size : 600\n",
  171. "ave_node_num : 32.63333333333333\n",
  172. "min_node_num : 2\n",
  173. "max_node_num : 126\n",
  174. "ave_edge_num : 62.13666666666666\n",
  175. "min_edge_num : 1\n",
  176. "max_edge_num : 149\n",
  177. "ave_node_degree : 3.862625314410413\n",
  178. "min_node_degree : 0.32\n",
  179. "max_node_degree : 5.230769230769231\n",
  180. "ave_fill_factor : 0.07509817146721588\n",
  181. "min_fill_factor : 0.0016\n",
  182. "max_fill_factor : 0.375\n",
  183. "node_label_num : 3\n",
  184. "edge_label_num : 0\n",
  185. "node_attr_dim : 18\n",
  186. "edge_attr_dim : 0\n",
  187. "class_number : 6\n",
  188. "\n",
  189. "\n",
  190. "Mutagenicity:\n",
  191. "substructures : {'non linear', 'linear'}\n",
  192. "node_labeled : True\n",
  193. "edge_labeled : True\n",
  194. "is_directed : False\n",
  195. "dataset_size : 4337\n",
  196. "ave_node_num : 30.317731150564907\n",
  197. "min_node_num : 4\n",
  198. "max_node_num : 417\n",
  199. "ave_edge_num : 30.76942587041734\n",
  200. "min_edge_num : 3\n",
  201. "max_edge_num : 112\n",
  202. "ave_node_degree : 2.0379886162441148\n",
  203. "min_node_degree : 0.47961630695443647\n",
  204. "max_node_degree : 2.3703703703703702\n",
  205. "ave_fill_factor : 0.0431047931997047\n",
  206. "min_fill_factor : 0.0005750795047415305\n",
  207. "max_fill_factor : 0.1875\n",
  208. "node_label_num : 14\n",
  209. "edge_label_num : 3\n",
  210. "node_attr_dim : 0\n",
  211. "edge_attr_dim : 0\n",
  212. "class_number : 2\n",
  213. "\n",
  214. "\n",
  215. "D&D:\n",
  216. "substructures : {'non linear', 'linear'}\n",
  217. "node_labeled : True\n",
  218. "edge_labeled : False\n",
  219. "is_directed : False\n",
  220. "dataset_size : 1178\n",
  221. "ave_node_num : 284.3166383701188\n",
  222. "min_node_num : 30\n",
  223. "max_node_num : 5748\n",
  224. "ave_edge_num : 715.6587436332767\n",
  225. "min_edge_num : 63\n",
  226. "max_edge_num : 14267\n",
  227. "ave_node_degree : 4.979061662020889\n",
  228. "min_node_degree : 3.6116504854368934\n",
  229. "max_node_degree : 8.933333333333334\n",
  230. "ave_fill_factor : 0.013790239865199101\n",
  231. "min_fill_factor : 0.0004318164098347239\n",
  232. "max_fill_factor : 0.09666666666666666\n",
  233. "node_label_num : 82\n",
  234. "edge_label_num : 0\n",
  235. "node_attr_dim : 0\n",
  236. "edge_attr_dim : 0\n",
  237. "class_number : 2\n",
  238. "\n",
  239. "\n",
  240. "AIDS:\n",
  241. "substructures : {'non linear', 'linear'}\n",
  242. "node_labeled : True\n",
  243. "edge_labeled : True\n",
  244. "is_directed : False\n",
  245. "dataset_size : 2000\n",
  246. "ave_node_num : 15.6925\n",
  247. "min_node_num : 2\n",
  248. "max_node_num : 95\n",
  249. "ave_edge_num : 16.195\n",
  250. "min_edge_num : 1\n",
  251. "max_edge_num : 103\n",
  252. "ave_node_degree : 2.012865369646626\n",
  253. "min_node_degree : 0.6\n",
  254. "max_node_degree : 2.8333333333333335\n",
  255. "ave_fill_factor : 0.08679744688995196\n",
  256. "min_fill_factor : 0.011412742382271468\n",
  257. "max_fill_factor : 0.25\n",
  258. "node_label_num : 38\n",
  259. "edge_label_num : 3\n",
  260. "node_attr_dim : 4\n",
  261. "edge_attr_dim : 0\n",
  262. "class_number : 2\n",
  263. "\n",
  264. "\n",
  265. "FIRSTMM_DB:\n",
  266. "substructures : {'non linear'}\n",
  267. "node_labeled : True\n",
  268. "edge_labeled : False\n",
  269. "is_directed : False\n",
  270. "dataset_size : 41\n",
  271. "ave_node_num : 1377.2682926829268\n",
  272. "min_node_num : 134\n",
  273. "max_node_num : 5037\n",
  274. "ave_edge_num : 3074.0975609756097\n",
  275. "min_edge_num : 320\n",
  276. "max_edge_num : 10888\n",
  277. "ave_node_degree : 4.503061007447199\n",
  278. "min_node_degree : 4.191919191919192\n",
  279. "max_node_degree : 4.776119402985074\n",
  280. "ave_fill_factor : 0.003689884678097613\n",
  281. "min_fill_factor : 0.00042914515176536197\n",
  282. "max_fill_factor : 0.017821341055914458\n",
  283. "node_label_num : 5\n",
  284. "edge_label_num : 0\n",
  285. "node_attr_dim : 1\n",
  286. "edge_attr_dim : 2\n",
  287. "class_number : 11\n",
  288. "\n",
  289. "\n",
  290. "MSRC9:\n",
  291. "substructures : {'non linear', 'linear'}\n",
  292. "node_labeled : True\n",
  293. "edge_labeled : False\n",
  294. "is_directed : False\n",
  295. "dataset_size : 221\n",
  296. "ave_node_num : 40.57918552036199\n",
  297. "min_node_num : 25\n",
  298. "max_node_num : 55\n",
  299. "ave_edge_num : 97.9366515837104\n",
  300. "min_edge_num : 53\n",
  301. "max_edge_num : 145\n",
  302. "ave_node_degree : 4.8153400199203436\n",
  303. "min_node_degree : 4.176470588235294\n",
  304. "max_node_degree : 5.576923076923077\n",
  305. "ave_fill_factor : 0.06021937645679636\n",
  306. "min_fill_factor : 0.04521181915272339\n",
  307. "max_fill_factor : 0.0848\n",
  308. "node_label_num : 10\n",
  309. "edge_label_num : 0\n",
  310. "node_attr_dim : 0\n",
  311. "edge_attr_dim : 0\n",
  312. "class_number : 8\n",
  313. "\n",
  314. "\n",
  315. "MSRC21:\n",
  316. "substructures : {'non linear', 'linear'}\n",
  317. "node_labeled : True\n",
  318. "edge_labeled : False\n",
  319. "is_directed : False\n",
  320. "dataset_size : 563\n",
  321. "ave_node_num : 77.52042628774423\n",
  322. "min_node_num : 51\n",
  323. "max_node_num : 141\n",
  324. "ave_edge_num : 198.32326820603907\n",
  325. "min_edge_num : 121\n",
  326. "max_edge_num : 405\n",
  327. "ave_node_degree : 5.102391320310953\n",
  328. "min_node_degree : 4.04\n",
  329. "max_node_degree : 6.6\n",
  330. "ave_fill_factor : 0.03357132864022473\n",
  331. "min_fill_factor : 0.01873405612244898\n",
  332. "max_fill_factor : 0.04652056901191849\n",
  333. "node_label_num : 22\n",
  334. "edge_label_num : 0\n",
  335. "node_attr_dim : 0\n",
  336. "edge_attr_dim : 0\n",
  337. "class_number : 20\n",
  338. "\n",
  339. "\n",
  340. "SYNTHETIC:\n",
  341. "substructures : {'non linear', 'linear'}\n",
  342. "node_labeled : True\n",
  343. "edge_labeled : False\n",
  344. "is_directed : False\n",
  345. "dataset_size : 300\n",
  346. "ave_node_num : 100.0\n",
  347. "min_node_num : 100\n",
  348. "max_node_num : 100\n",
  349. "ave_edge_num : 196.0\n",
  350. "min_edge_num : 196\n",
  351. "max_edge_num : 196\n",
  352. "ave_node_degree : 3.9200000000000017\n",
  353. "min_node_degree : 3.92\n",
  354. "max_node_degree : 3.92\n",
  355. "ave_fill_factor : 0.019600000000000003\n",
  356. "min_fill_factor : 0.0196\n",
  357. "max_fill_factor : 0.0196\n",
  358. "node_label_num : 8\n",
  359. "edge_label_num : 0\n",
  360. "node_attr_dim : 1\n",
  361. "edge_attr_dim : 0\n",
  362. "class_number : 2\n",
  363. "\n",
  364. "\n",
  365. "BZR:\n",
  366. "substructures : {'non linear', 'linear'}\n",
  367. "node_labeled : True\n",
  368. "edge_labeled : False\n",
  369. "is_directed : False\n",
  370. "dataset_size : 405\n",
  371. "ave_node_num : 35.75061728395062\n",
  372. "min_node_num : 13\n",
  373. "max_node_num : 57\n",
  374. "ave_edge_num : 38.358024691358025\n",
  375. "min_edge_num : 13\n",
  376. "max_edge_num : 60\n",
  377. "ave_node_degree : 2.1466610247664697\n",
  378. "min_node_degree : 2.0\n",
  379. "max_node_degree : 2.2777777777777777\n",
  380. "ave_fill_factor : 0.0314385616191916\n",
  381. "min_fill_factor : 0.017851646660510926\n",
  382. "max_fill_factor : 0.07692307692307693\n",
  383. "node_label_num : 10\n",
  384. "edge_label_num : 0\n",
  385. "node_attr_dim : 3\n",
  386. "edge_attr_dim : 0\n",
  387. "class_number : 2\n",
  388. "\n"
  389. ]
  390. },
  391. {
  392. "name": "stdout",
  393. "output_type": "stream",
  394. "text": [
  395. "\n",
  396. "COX2:\n",
  397. "substructures : {'non linear', 'linear'}\n",
  398. "node_labeled : True\n",
  399. "edge_labeled : False\n",
  400. "is_directed : False\n",
  401. "dataset_size : 467\n",
  402. "ave_node_num : 41.224839400428266\n",
  403. "min_node_num : 32\n",
  404. "max_node_num : 56\n",
  405. "ave_edge_num : 43.44539614561028\n",
  406. "min_edge_num : 34\n",
  407. "max_edge_num : 59\n",
  408. "ave_node_degree : 2.1077350079995685\n",
  409. "min_node_degree : 2.076923076923077\n",
  410. "max_node_degree : 2.1739130434782608\n",
  411. "ave_fill_factor : 0.025799177869507202\n",
  412. "min_fill_factor : 0.01881377551020408\n",
  413. "max_fill_factor : 0.033203125\n",
  414. "node_label_num : 8\n",
  415. "edge_label_num : 0\n",
  416. "node_attr_dim : 3\n",
  417. "edge_attr_dim : 0\n",
  418. "class_number : 2\n",
  419. "\n",
  420. "\n",
  421. "DHFR:\n",
  422. "substructures : {'non linear', 'linear'}\n",
  423. "node_labeled : True\n",
  424. "edge_labeled : False\n",
  425. "is_directed : False\n",
  426. "dataset_size : 756\n",
  427. "ave_node_num : 42.42724867724868\n",
  428. "min_node_num : 20\n",
  429. "max_node_num : 71\n",
  430. "ave_edge_num : 44.544973544973544\n",
  431. "min_edge_num : 21\n",
  432. "max_edge_num : 73\n",
  433. "ave_node_degree : 2.102359895640024\n",
  434. "min_node_degree : 2.0338983050847457\n",
  435. "max_node_degree : 2.2\n",
  436. "ave_fill_factor : 0.026126638866896944\n",
  437. "min_fill_factor : 0.0144812537195001\n",
  438. "max_fill_factor : 0.0525\n",
  439. "node_label_num : 9\n",
  440. "edge_label_num : 0\n",
  441. "node_attr_dim : 3\n",
  442. "edge_attr_dim : 0\n",
  443. "class_number : 2\n",
  444. "\n",
  445. "\n",
  446. "PROTEINS:\n",
  447. "substructures : {'non linear', 'linear'}\n",
  448. "node_labeled : True\n",
  449. "edge_labeled : False\n",
  450. "is_directed : False\n",
  451. "dataset_size : 1113\n",
  452. "ave_node_num : 39.05750224618149\n",
  453. "min_node_num : 4\n",
  454. "max_node_num : 620\n",
  455. "ave_edge_num : 72.8158131176999\n",
  456. "min_edge_num : 5\n",
  457. "max_edge_num : 1049\n",
  458. "ave_node_degree : 3.734642171150555\n",
  459. "min_node_degree : 1.7142857142857142\n",
  460. "max_node_degree : 5.071428571428571\n",
  461. "ave_fill_factor : 0.09599853508460923\n",
  462. "min_fill_factor : 0.0027289281997918834\n",
  463. "max_fill_factor : 0.375\n",
  464. "node_label_num : 3\n",
  465. "edge_label_num : 0\n",
  466. "node_attr_dim : 1\n",
  467. "edge_attr_dim : 0\n",
  468. "class_number : 2\n",
  469. "\n",
  470. "\n",
  471. "PROTEINS_full:\n",
  472. "substructures : {'non linear', 'linear'}\n",
  473. "node_labeled : True\n",
  474. "edge_labeled : False\n",
  475. "is_directed : False\n",
  476. "dataset_size : 1113\n",
  477. "ave_node_num : 39.05750224618149\n",
  478. "min_node_num : 4\n",
  479. "max_node_num : 620\n",
  480. "ave_edge_num : 72.8158131176999\n",
  481. "min_edge_num : 5\n",
  482. "max_edge_num : 1049\n",
  483. "ave_node_degree : 3.734642171150555\n",
  484. "min_node_degree : 1.7142857142857142\n",
  485. "max_node_degree : 5.071428571428571\n",
  486. "ave_fill_factor : 0.09599853508460923\n",
  487. "min_fill_factor : 0.0027289281997918834\n",
  488. "max_fill_factor : 0.375\n",
  489. "node_label_num : 3\n",
  490. "edge_label_num : 0\n",
  491. "node_attr_dim : 29\n",
  492. "edge_attr_dim : 0\n",
  493. "class_number : 2\n",
  494. "\n",
  495. "\n",
  496. "NCI1:\n",
  497. "substructures : {'non linear', 'linear'}\n",
  498. "node_labeled : True\n",
  499. "edge_labeled : False\n",
  500. "is_directed : False\n",
  501. "dataset_size : 4110\n",
  502. "ave_node_num : 29.8654501216545\n",
  503. "min_node_num : 3\n",
  504. "max_node_num : 111\n",
  505. "ave_edge_num : 32.3\n",
  506. "min_edge_num : 2\n",
  507. "max_edge_num : 119\n",
  508. "ave_node_degree : 2.155013792267071\n",
  509. "min_node_degree : 0.8\n",
  510. "max_node_degree : 2.769230769230769\n",
  511. "ave_fill_factor : 0.04239828192835043\n",
  512. "min_fill_factor : 0.009522961908152367\n",
  513. "max_fill_factor : 0.2222222222222222\n",
  514. "node_label_num : 37\n",
  515. "edge_label_num : 0\n",
  516. "node_attr_dim : 0\n",
  517. "edge_attr_dim : 0\n",
  518. "class_number : 2\n",
  519. "\n",
  520. "\n",
  521. "NCI109:\n",
  522. "substructures : {'non linear', 'linear'}\n",
  523. "node_labeled : True\n",
  524. "edge_labeled : False\n",
  525. "is_directed : False\n",
  526. "dataset_size : 4127\n",
  527. "ave_node_num : 29.681124303368065\n",
  528. "min_node_num : 4\n",
  529. "max_node_num : 111\n",
  530. "ave_edge_num : 32.13084565059365\n",
  531. "min_edge_num : 3\n",
  532. "max_edge_num : 119\n",
  533. "ave_node_degree : 2.156446168619097\n",
  534. "min_node_degree : 1.0909090909090908\n",
  535. "max_node_degree : 2.769230769230769\n",
  536. "ave_fill_factor : 0.04263668408405519\n",
  537. "min_fill_factor : 0.009522961908152367\n",
  538. "max_fill_factor : 0.1875\n",
  539. "node_label_num : 38\n",
  540. "edge_label_num : 0\n",
  541. "node_attr_dim : 0\n",
  542. "edge_attr_dim : 0\n",
  543. "class_number : 2\n",
  544. "\n",
  545. "load SDF: 100%|██████████| 4457424/4457424 [00:08<00:00, 522501.84it/s]\n",
  546. "ajust data: 100%|██████████| 42687/42687 [00:09<00:00, 4625.31it/s] \n",
  547. "\n",
  548. "NCI-HIV:\n",
  549. "substructures : {'non linear', 'linear'}\n",
  550. "node_labeled : True\n",
  551. "edge_labeled : True\n",
  552. "is_directed : False\n",
  553. "dataset_size : 42682\n",
  554. "ave_node_num : 45.70945597675835\n",
  555. "min_node_num : 2\n",
  556. "max_node_num : 438\n",
  557. "ave_edge_num : 47.7137903565906\n",
  558. "min_edge_num : 1\n",
  559. "max_edge_num : 441\n",
  560. "ave_node_degree : 2.087755727203458\n",
  561. "min_node_degree : 1.0\n",
  562. "max_node_degree : 4.571428571428571\n",
  563. "ave_fill_factor : 0.02739985514266206\n",
  564. "min_fill_factor : 0.002298742728466879\n",
  565. "max_fill_factor : 0.25\n",
  566. "node_label_num : 63\n",
  567. "edge_label_num : 3\n",
  568. "node_attr_dim : 0\n",
  569. "edge_attr_dim : 0\n",
  570. "class_number : 3\n",
  571. "\n"
  572. ]
  573. }
  574. ],
  575. "source": [
  576. "import sys\n",
  577. "sys.path.insert(0, \"../\")\n",
  578. "from pygraph.utils.graphfiles import loadDataset\n",
  579. "from pygraph.utils.graphdataset import get_dataset_attributes\n",
  580. "\n",
  581. "dslist = [\n",
  582. " {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',},\n",
  583. " {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds',\n",
  584. " 'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt',},\n",
  585. " {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds',},\n",
  586. " {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds',},\n",
  587. " {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',\n",
  588. " 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},\n",
  589. " {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},\n",
  590. " {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},\n",
  591. " {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},\n",
  592. " {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',\n",
  593. " 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},\n",
  594. " {'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'},\n",
  595. " {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'},\n",
  596. " {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'},\n",
  597. " {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'},\n",
  598. " {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'},\n",
  599. " {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'},\n",
  600. " {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'},\n",
  601. " {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, \n",
  602. " {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'},\n",
  603. " {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, \n",
  604. " {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',\n",
  605. " 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},\n",
  606. " {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',\n",
  607. " 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},\n",
  608. " {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',\n",
  609. " 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',},\n",
  610. "\n",
  611. "# # not working below\n",
  612. "# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},\n",
  613. "# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},\n",
  614. "# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},\n",
  615. "# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},\n",
  616. "]\n",
  617. "\n",
  618. "for ds in dslist:\n",
  619. " dataset, y = loadDataset(\n",
  620. " ds['dataset'],\n",
  621. " filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),\n",
  622. " extra_params=(ds['extra_params'] if 'extra_params' in ds else None))\n",
  623. " attrs = get_dataset_attributes(\n",
  624. " dataset, target=y, node_label='atom', edge_label='bond_type')\n",
  625. " print()\n",
  626. " print(ds['name'] + ':')\n",
  627. " for atr in attrs:\n",
  628. " print(atr, ':', attrs[atr])\n",
  629. " print()"
  630. ]
  631. }
  632. ],
  633. "metadata": {
  634. "kernelspec": {
  635. "display_name": "Python 3",
  636. "language": "python",
  637. "name": "python3"
  638. },
  639. "language_info": {
  640. "codemirror_mode": {
  641. "name": "ipython",
  642. "version": 3
  643. },
  644. "file_extension": ".py",
  645. "mimetype": "text/x-python",
  646. "name": "python",
  647. "nbconvert_exporter": "python",
  648. "pygments_lexer": "ipython3",
  649. "version": "3.6.7"
  650. }
  651. },
  652. "nbformat": 4,
  653. "nbformat_minor": 2
  654. }

A Python package for graph kernels, graph edit distances, and the graph pre-image problem.