You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

step5_get_entity_fragment.py 2.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132
  #encoding=utf-8
"""Generate per-entity fragments from an id-encoded triple file.

Input: a text file with one triple per line, written as three tab-separated
ids ``subject<TAB>predicate<TAB>object``.  An object id of '-1' is treated
specially: the triple contributes only the subject's outgoing predicate and
no entity-to-entity edge (presumably '-1' marks a literal / non-entity
object -- confirm against the step that produces the id file).

Output: one line per entity id in ``range(ENTITY_COUNT)``::

    <eid>\t<in-entity-edges>|<out-entity-edges>|<in-preds>|<out-preds>|<types>

where the entity-edge sections are sequences of ``neighbour:pred;pred;,`` and
the remaining sections are sequences of ``id,``.

The code runs unchanged on Python 2.7 and Python 3.
"""
from collections import defaultdict

# Placeholders: replace with the real paths / ids before running the script.
TRIPLE_FILE = 'triple file represented by ids here'
FRAGMENT_FILE = 'output fragment file'
TYPE_PREDICATE_ID = 'id of <type>'

# Number of entities; ids 0 .. ENTITY_COUNT - 1 are written out.
# (Here we should iterate over every entity.)
ENTITY_COUNT = 12301050

# Id that is never emitted and never recorded as an entity neighbour.
NO_ENTITY = '-1'

# A progress counter is printed every PROGRESS_INTERVAL items.
PROGRESS_INTERVAL = 10000


def build_indexes(lines, type_id=TYPE_PREDICATE_ID):
    """Index id-encoded triples by entity.

    ``lines`` is any iterable of ``subject\\tpredicate\\tobject`` strings (an
    open file works); a trailing line terminator is optional.  ``type_id`` is
    the predicate id that denotes the <type> relation.

    Returns a 5-tuple ``(inEnEdge, outEnEdge, inEdge, outEdge, types)``:

    * inEnEdge[o][s]  -- set of predicates p with a triple (s, p, o)
    * outEnEdge[s][o] -- set of predicates p with a triple (s, p, o)
    * inEdge[o]       -- set of predicates pointing at entity o
    * outEdge[s]      -- set of predicates leaving s
    * types[s]        -- set of type ids of s

    Raises IndexError for a line with fewer than three tab-separated fields.
    """
    in_en_edge = defaultdict(lambda: defaultdict(set))
    out_en_edge = defaultdict(lambda: defaultdict(set))
    in_edge = defaultdict(set)
    out_edge = defaultdict(set)
    types = defaultdict(set)

    for count, line in enumerate(lines, 1):
        # Strip only the line terminator; slicing off the last character
        # would corrupt the object id of a final line lacking a newline.
        tri = line.rstrip('\r\n').split('\t')
        subj, pred, obj = tri[0], tri[1], tri[2]

        if pred == type_id and obj != NO_ENTITY:
            types[subj].add(obj)
        else:
            out_edge[subj].add(pred)
            # NOTE(review): the original indentation was lost; all three
            # updates below are assumed to sit under the `obj != '-1'` test.
            if obj != NO_ENTITY:
                out_en_edge[subj][obj].add(pred)
                in_edge[obj].add(pred)
                in_en_edge[obj][subj].add(pred)

        if count % PROGRESS_INTERVAL == 0:
            print(count)

    return in_en_edge, out_en_edge, in_edge, out_edge, types


def _join_ids(ids, terminator):
    """Concatenate ``ids``, each followed by ``terminator``, skipping '-1'."""
    return ''.join([item + terminator for item in ids if item != NO_ENTITY])


def _format_entity_edges(neighbours):
    """Render ``{neighbour: predicates}`` as ``neighbour:pred;pred;,`` runs."""
    return ''.join(
        [key + ':' + _join_ids(preds, ';') + ','
         for key, preds in neighbours.items()]
    )


def format_fragment(eid, inEnEdge, outEnEdge, inEdge, outEdge, types):
    """Return the five '|'-separated fragment sections for entity ``eid``.

    Sections for which ``eid`` has no entry are empty strings, so an entity
    that is absent from every index yields ``'||||'``.  Lookups use ``get``
    so that defaultdict indexes are not grown by missing ids.
    """
    return '|'.join([
        _format_entity_edges(inEnEdge.get(eid, {})),
        _format_entity_edges(outEnEdge.get(eid, {})),
        _join_ids(inEdge.get(eid, ()), ','),
        _join_ids(outEdge.get(eid, ()), ','),
        _join_ids(types.get(eid, ()), ','),
    ])


def main(triple_path=TRIPLE_FILE, fragment_path=FRAGMENT_FILE,
         entity_count=ENTITY_COUNT):
    """Read ``triple_path`` and write one fragment line per entity id."""
    with open(triple_path, 'r') as f:
        indexes = build_indexes(f)

    # Sizes of inEnEdge, outEnEdge, inEdge, outEdge, types (in that order).
    for index in indexes:
        print(len(index))

    # The context manager guarantees the output is flushed and closed.
    with open(fragment_path, 'w') as wr:
        for i in range(entity_count):
            if i % PROGRESS_INTERVAL == 0:
                print(i)
            eid = "%d" % i
            wr.write("%s\t%s\n" % (eid, format_fragment(eid, *indexes)))


if __name__ == '__main__':
    main()

The GAnswer system is a natural language QA system developed by the Institute of Computer Science & Technology Data Management Lab, Peking University, led by Prof. Zou Lei. GAnswer is able to translate natural language questions to query graphs containing semant