You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

step1_clean_triple.py 1.9 kB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import re
  2. '''
  3. Step 1: Clean the triple file. In the dbpedia case, we just need the part of resource URI that indicate entity/type/predicate names.
  4. '''
  5. fileName = []#List of triple files to be process
  6. notRdf = open('./notRdf.txt','w')#Record the lines that refers to a type but not rdf:type
  7. for index2,fname in enumerate(fileName):
  8. f = open('./'+fname)
  9. triple = open('output triple files here','w')
  10. prefix_f = open('output prefix files here','w')# save the prefix in files in case of it may be useful in the future.
  11. i = 0
  12. count = 0
  13. prefix_set = set()
  14. for line in f:
  15. if line[0] != '<':
  16. print(i)
  17. i = i + 1
  18. count += 1
  19. continue
  20. line = line[:-3].replace('> <','>$-$-$<').replace('> "','>$-$-$"')
  21. line = line.split('$-$-$')
  22. if i==0:
  23. i += 1
  24. continue
  25. new_line=[]
  26. if "type>" in line[1]:
  27. if "rdf" not in line[1]:
  28. notRdf.write(str(line)+'\n')
  29. continue
  30. for index,item in enumerate(line):
  31. if not item:
  32. count +=1
  33. break
  34. if item[0]=='<':
  35. pos = item.rfind('/')
  36. word = item[pos+1:-1].split("#")
  37. if len(word)<2:
  38. new_line.append('<'+word[0]+'>')
  39. else:
  40. new_line.append('<'+word[1]+'>')
  41. if index == 1:
  42. tmp = new_line[1][1:len(new_line[1])-1]
  43. pos2 = line[1].rfind(tmp)
  44. prefix = line[1][1:pos2-1]
  45. prefix_set.add(tmp + '^^^'+prefix+'\n')
  46. continue
  47. elif item.count('"') >=2:
  48. item = item.split('^^')[0].split('@')[0]
  49. pattern = re.compile('"(.*)"')
  50. word = '"'+''.join(pattern.findall(item))+'"'
  51. new_line.append(word)
  52. continue
  53. else:
  54. print(i)
  55. i += 1
  56. #print('\t'.join(new_line))
  57. if i%1000000==0:
  58. print("%d:%d"%(8,i))
  59. triple.write('\t'.join(new_line)+'\n')
  60. for item in prefix_set:
  61. prefix_f.write(item)
  62. f.close()
  63. triple.close()
  64. prefix_f.close()

The GAnswer system is a natural language QA system developed by the Institute of Computer Science & Technology Data Management Lab, Peking University, led by Prof. Zou Lei. GAnswer is able to translate natural language questions to query graphs containing semant