You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

ParaphraseDictionary.java 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. package paradict;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.IOException;
  6. import java.io.InputStreamReader;
  7. import java.util.ArrayList;
  8. import java.util.Collections;
  9. import java.util.HashMap;
  10. import java.util.HashSet;
  11. import java.util.Iterator;
  12. import nlp.tool.CoreNLP;
  13. import qa.Globals;
  14. public class ParaphraseDictionary {
  15. public static String localDataPath;
  16. public static String dbpedia_relation_paraphrases_baseform_withScore;
  17. public static String dbpedia_relation_paraphrases_baseform_withScore_rerank;
  18. public static String dbpedia_relation_paraphrases_handwrite;
  19. public static String dbpedia_predicate_id;
  20. public static String dbpedia_dbo_predicate;
  21. public HashMap<String, Integer> predicate_2_id = null;
  22. public HashMap<Integer, String> id_2_predicate = null;
  23. public HashSet<Integer> dbo_predicate_id = null;
  24. public HashMap<String, ArrayList<PredicateIDAndSupport>> nlPattern_2_predicateList = null;
  25. public HashMap<String, ArrayList<String>> invertedIndex = null;
  26. public HashSet<String> relns_subject;
  27. public HashSet<String> relns_object;
  28. public HashSet<String> prepositions;
  29. public HashSet<String> bannedTypes;
  30. //public final int typePredicateID = 1541; //dbpedia2015 <type>=1541
  31. public final int typePredicateID = 5157; //Dbpedia 2016 <type>=5166
  32. public int totalPredCount = 0;
  33. public int paraphrasedPredCount = 0;
  34. public int lineCount = 0;
  35. /**
  36. * constructor
  37. * @param parser
  38. * @param ner
  39. */
  40. public ParaphraseDictionary () {
  41. String fixedPath = Globals.localPath;
  42. System.out.println(System.getProperty("user.dir"));
  43. localDataPath = fixedPath + "data/DBpedia2016/parapharse/";
  44. dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt";
  45. dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt";
  46. dbpedia_predicate_id = localDataPath + "16predicate_id.txt";
  47. dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt";
  48. bannedTypes = new HashSet<String>();
  49. bannedTypes.add("Mayor");
  50. relns_subject = new HashSet<String>();
  51. relns_subject.add("subj");
  52. relns_subject.add("csubjpass");
  53. relns_subject.add("csubj");
  54. relns_subject.add("xsubj");
  55. relns_subject.add("nsubjpass");
  56. relns_subject.add("nsubj");
  57. relns_subject.add("poss"); // Obama's wife
  58. relns_subject.add("dobj");
  59. relns_object = new HashSet<String>();
  60. relns_object.add("dobj");
  61. relns_object.add("iobj");
  62. relns_object.add("obj");
  63. relns_object.add("pobj");
  64. prepositions = new HashSet<String>();
  65. prepositions.add("in");//in at on with to from before after of for
  66. prepositions.add("at");
  67. prepositions.add("on");
  68. prepositions.add("with");
  69. prepositions.add("to");
  70. prepositions.add("from");
  71. prepositions.add("before");
  72. prepositions.add("after");
  73. prepositions.add("of");
  74. prepositions.add("for");
  75. prepositions.add("as");
  76. try {
  77. loadPredicateId();
  78. loadDboPredicate();
  79. loadParaDict();
  80. buildInvertedIndex();
  81. typePredicateID = predicate_2_id.get("type");
  82. } catch (Exception e) {
  83. e.printStackTrace();
  84. }
  85. }
  86. /**
  87. * Load the mapping between predicates and their IDs.
  88. * @throws IOException
  89. */
  90. public void loadPredicateId () throws IOException {
  91. predicate_2_id = new HashMap<String, Integer>();
  92. id_2_predicate = new HashMap<Integer, String>();
  93. String input_filename = dbpedia_predicate_id;
  94. File file = new File(input_filename);
  95. InputStreamReader in = null;
  96. BufferedReader br = null;
  97. try{
  98. in = new InputStreamReader(new FileInputStream(file), "utf-8");
  99. br = new BufferedReader(in);
  100. String line = null;
  101. while ((line = br.readLine())!= null) {
  102. String[] lines = line.split("\t");
  103. predicate_2_id.put(lines[0], Integer.parseInt(lines[1]));
  104. id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]);
  105. }
  106. }catch(IOException e){
  107. System.out.println("NLPatterns.loadPredicateId() : IOException!");
  108. e.printStackTrace();
  109. }finally{
  110. if(br != null){
  111. try{
  112. br.close();
  113. }catch(IOException e){
  114. e.printStackTrace();
  115. }
  116. }
  117. }
  118. System.out.println("NLPatterns.loadPredicateId() : ok!");
  119. }
  120. public void loadDboPredicate() throws IOException
  121. {
  122. dbo_predicate_id = new HashSet<Integer>();
  123. int cnt = 0;
  124. String input_filename = dbpedia_dbo_predicate;
  125. InputStreamReader in = null;
  126. BufferedReader br = null;
  127. try{
  128. File file = new File(input_filename);
  129. in = new InputStreamReader(new FileInputStream(file), "utf-8");
  130. br = new BufferedReader(in);
  131. String line = null;
  132. while ((line = br.readLine())!= null)
  133. {
  134. if (!predicate_2_id.containsKey(line))
  135. {
  136. cnt++;
  137. //System.out.println("error: not found "+line+" id.");
  138. continue;
  139. }
  140. dbo_predicate_id.add(predicate_2_id.get(line));
  141. }
  142. }catch(IOException e){
  143. System.out.println("NLPatterns.loadDboPredicate() : IOException!");
  144. }finally{
  145. if(br!=null){
  146. try{
  147. br.close();
  148. }catch(IOException e){
  149. e.printStackTrace();
  150. }
  151. }
  152. }
  153. System.out.println("Warning: DBO not found id count: "+cnt);
  154. System.out.println("NLPatterns.loadDboPredicate() : ok!");
  155. }
  156. /**
  157. * Get predicate by its id
  158. * @param predicateID
  159. * @return
  160. */
  161. public String getPredicateById (int predicateID) {
  162. return id_2_predicate.get(predicateID);
  163. }
  164. public void loadParaDict () throws Exception {
  165. nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>();
  166. HashSet<String> missInDBP2014 = new HashSet<String>();
  167. InputStreamReader in = null;
  168. BufferedReader br = null;
  169. try{
  170. String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank;
  171. File file = new File(inputFileName);
  172. in = new InputStreamReader(new FileInputStream(file), "utf-8");
  173. br = new BufferedReader(in);
  174. String line = null;
  175. int lineCount = 0;
  176. //line = br.readLine();//read the first line which indicates the format
  177. while ((line = br.readLine()) != null)
  178. {
  179. if (line.startsWith("#")) continue;
  180. lineCount ++;
  181. String[] content = line.split("\t");
  182. if(!predicate_2_id.containsKey(content[0]))
  183. {
  184. missInDBP2014.add(content[0]);
  185. continue;
  186. }
  187. int predicateID = predicate_2_id.get(content[0]);
  188. String nlPattern = content[1].toLowerCase();
  189. int support = Integer.parseInt(content[2]);
  190. //double score = Double.parseDouble(content[3]);
  191. String []slctString = content[3].split(" ");
  192. double[] slct = new double[slctString.length];
  193. for (int i=0; i < slct.length; i++) {
  194. slct[i] = Double.parseDouble(slctString[i]);
  195. }
  196. if (!nlPattern_2_predicateList.containsKey(nlPattern)) {
  197. nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>());
  198. }
  199. nlPattern_2_predicateList.get(nlPattern).add(new PredicateIDAndSupport(predicateID, support, slct));
  200. }
  201. System.out.println("Number of NL-Patterns-to-predicate mappings = " + lineCount);
  202. System.out.println("NLPatterns.size = " + nlPattern_2_predicateList.size());
  203. System.out.println("Predicate.size = " + predicate_2_id.size());
  204. System.out.println("Warning: Predicates not in DBpedia 2014 count: "+missInDBP2014.size());
  205. // Notice predicate itself and handwritten patterns have no wordSelectivity.
  206. addPredicateAsNLPattern(); // This is very important.
  207. addHandwriteAsNLPattern();
  208. Iterator<String> it = nlPattern_2_predicateList.keySet().iterator();
  209. while (it.hasNext()) {
  210. Collections.sort(nlPattern_2_predicateList.get(it.next()));
  211. }
  212. }catch(IOException e){
  213. System.out.println("NLPatterns.Paradict() : IOException!");
  214. }finally{
  215. if(br!=null){
  216. try{
  217. br.close();
  218. }catch(IOException e){
  219. e.printStackTrace();
  220. }
  221. }
  222. }
  223. System.out.println("NLPatterns.Paradict() : ok!");
  224. }
  225. /**
  226. * A set of very important NL patterns are the predicates themselves!
  227. */
  228. public void addPredicateAsNLPattern () {
  229. final int support = 200;
  230. int predicate_id;
  231. for (String p : predicate_2_id.keySet())
  232. {
  233. // TODO: Omitting some bad relations (should be discarded in future)
  234. if(p.equals("state") || p.equals("states"))
  235. continue;
  236. predicate_id = predicate_2_id.get(p);
  237. StringBuilder pattern = new StringBuilder("");
  238. // Work/runtime 11,SpaceStation/volume 68 and some predicates have prefix (DBpedia 2015), discard the prefix when generating pattern
  239. if(p.contains("/"))
  240. {
  241. if(p.charAt(0)>='A' && p.charAt(0)<='Z')
  242. p = p.substring(p.indexOf("/")+1);
  243. //gameW/l 1974
  244. else
  245. p = p.replace("/", "");
  246. }
  247. int last = 0, i = 0;
  248. for(i = 0; i < p.length(); i ++) {
  249. // if it were not a small letter, then break it.
  250. if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) {
  251. pattern.append(p.substring(last, i).toLowerCase());
  252. pattern.append(" ");
  253. last = i;
  254. }
  255. }
  256. pattern.append(p.substring(last, i).toLowerCase());
  257. for (i = 3; i < pattern.length(); i ++) {
  258. // the blank between two digits should be deleted.
  259. if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9'
  260. && pattern.charAt(i-1)==' '
  261. && pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') {
  262. pattern.deleteCharAt(i-1);
  263. }
  264. // the blank between I and D should be deleted.
  265. else if (pattern.charAt(i)=='d'
  266. && pattern.charAt(i-1)==' '
  267. && pattern.charAt(i-2)=='i'
  268. && pattern.charAt(i-3)==' ') {
  269. pattern.deleteCharAt(i-1);
  270. }
  271. // the blank between D and B should be deleted.
  272. else if (pattern.charAt(i)=='b'
  273. && pattern.charAt(i-1)==' '
  274. && pattern.charAt(i-2)=='d'
  275. && pattern.charAt(i-3)==' ') {
  276. pattern.deleteCharAt(i-1);
  277. }
  278. }
  279. // pattern -> base form
  280. /*String[] ptns = pattern.toString().split(" ");
  281. pattern = new StringBuilder("");
  282. for (String s : ptns) {
  283. pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s));
  284. pattern.append(" ");
  285. }
  286. pattern.deleteCharAt(pattern.length()-1);
  287. String patternString = pattern.toString();*/
  288. // Special case cannot use base form, eg, foundingYear //TODO: maybe Porter's Algorithm
  289. String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString());
  290. //System.out.println(p + "-->" + patternString);
  291. if (!nlPattern_2_predicateList.containsKey(patternString)) {
  292. nlPattern_2_predicateList.put(patternString, new ArrayList<PredicateIDAndSupport>());
  293. }
  294. nlPattern_2_predicateList.get(patternString).add(
  295. new PredicateIDAndSupport(predicate_id,
  296. support,
  297. PredicateIDAndSupport.genSlct(patternString.split(" ").length)));
  298. }
  299. System.out.println("NLPatterns.addPredicateAsNLPattern(): ok!");
  300. }
  301. public void addHandwriteAsNLPattern() throws IOException {
  302. String inputFileName = dbpedia_relation_paraphrases_handwrite;
  303. InputStreamReader in = null;
  304. BufferedReader br = null;
  305. try{
  306. File file = new File(inputFileName);
  307. in = new InputStreamReader(new FileInputStream(file), "utf-8");
  308. br = new BufferedReader(in);
  309. String line = null;
  310. //int lineCount = 0;
  311. //line = br.readLine();//read the first line which indicates the format
  312. while ((line = br.readLine()) != null) {
  313. if (line.startsWith("#") || line.isEmpty()) continue;
  314. //lineCount ++;
  315. String[] content = line.split("\t");
  316. if(!predicate_2_id.containsKey(content[0]))
  317. continue;
  318. int predicateID = predicate_2_id.get(content[0]);
  319. String nlPattern = content[1].toLowerCase();
  320. int support = Integer.parseInt(content[2]);
  321. if (!nlPattern_2_predicateList.containsKey(nlPattern)) {
  322. nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>());
  323. }
  324. nlPattern_2_predicateList.get(nlPattern).add(
  325. new PredicateIDAndSupport(predicateID,
  326. support,
  327. PredicateIDAndSupport.genSlct(nlPattern.split(" ").length)));
  328. }
  329. }catch(IOException e){
  330. System.out.println("NLPatterns.addHandwriteAsNLPattern(): IOException!");
  331. }finally{
  332. if(br!=null){
  333. try{
  334. br.close();
  335. }catch(IOException e){
  336. e.printStackTrace();
  337. }
  338. }
  339. }
  340. System.out.println("NLPatterns.addHandwriteAsNLPattern(): ok!");
  341. }
  342. /**
  343. * Show the NLPatterns
  344. */
  345. public void showNLPatterns () {
  346. /*for (String s: syntacticMarker) {
  347. System.out.println(s);
  348. }
  349. GlobalTools.systemPause();*/
  350. System.out.println("predicate-->id");
  351. for (String s : predicate_2_id.keySet()) {
  352. System.out.println(s + "-->" + predicate_2_id.get(s));
  353. }
  354. Globals.systemPause();
  355. int count = 1;
  356. System.out.println("nlPattern-->predicate<support>");
  357. for (String p : nlPattern_2_predicateList.keySet()) {
  358. System.out.print("" + (count++) + ".\t" + p + "\t[" + nlPattern_2_predicateList.get(p).size() + "]\t");
  359. for (PredicateIDAndSupport i : nlPattern_2_predicateList.get(p)) {
  360. System.out.print(id_2_predicate.get(i.predicateID) + "<" + i.support + ">" + ", ");
  361. }
  362. System.out.println();
  363. }
  364. }
  365. /**
  366. * Build the inverted index, where each word will be mapped to the patterns that it occurs
  367. */
  368. public void buildInvertedIndex () {
  369. invertedIndex = new HashMap<String, ArrayList<String>>();
  370. // traversing all patterns
  371. for (String p : nlPattern_2_predicateList.keySet()) {
  372. String[] tokens = p.split(" ");
  373. for (String token : tokens) {
  374. if (token.length() < 1) continue;
  375. if (!invertedIndex.containsKey(token)) {
  376. invertedIndex.put(token, new ArrayList<String>());
  377. }
  378. invertedIndex.get(token).add(p);
  379. }
  380. }
  381. System.out.println("NLPatterns.buildInvertedIndex(): ok!");
  382. }
  383. public static void main (String[] args) {
  384. Globals.coreNLP = new CoreNLP();
  385. Globals.pd = new ParaphraseDictionary();
  386. //Globals.pd.showNLPatterns();
  387. }
  388. }

GAnswer system is a natural language QA system developed by the Institute of Computer Science & Technology Data Management Lab, Peking University, led by Prof. Zou Lei. GAnswer is able to translate natural language questions to query graphs containing semant