
First commit to support Chinese QA

This is version 0.5; it can handle Chinese QA now, but a lot of work remains to improve it further.
pkubase
knightmarehs committed 6 years ago
commit f1ef5b06cc
29 changed files with 990 additions and 2537 deletions
  1. +7 -86    src/addition/AddtionalFix.java
  2. +4 -2     src/fgmt/RelationFragment.java
  3. +4 -41    src/fgmt/TypeFragment.java
  4. +0 -119   src/lcn/BuildIndexForEntityFragments.java
  5. +0 -107   src/lcn/BuildIndexForTypeShortName.java
  6. +44 -3    src/lcn/EntityFragmentFields.java
  7. +1 -5     src/log/QueryLogger.java
  8. +14 -181  src/nlp/ds/DependencyTree.java
  9. +49 -25   src/nlp/ds/Sentence.java
  10. +0 -201  src/nlp/tool/CoreNLP.java
  11. +1 -4    src/nlp/tool/Main.java
  12. +0 -70   src/nlp/tool/MaltParser.java
  13. +0 -73   src/nlp/tool/MaltParserCon.java
  14. +0 -53   src/nlp/tool/NERecognizer.java
  15. +28 -28  src/nlp/tool/StanfordParser.java
  16. +46 -109 src/paradict/ParaphraseDictionary.java
  17. +18 -12  src/qa/GAnswer.java
  18. +12 -36  src/qa/Globals.java
  19. +32 -35  src/qa/Query.java
  20. +0 -864  src/qa/extract/EntityRecognition.java
  21. +566 -0  src/qa/extract/EntityRecognitionCh.java
  22. +1 -2    src/qa/extract/ExtractImplicitRelation.java
  23. +0 -2    src/qa/extract/ExtractRelation.java
  24. +39 -47  src/qa/extract/TypeRecognition.java
  25. +0 -163  src/qa/mapping/DBpediaLookup.java
  26. +112 -197 src/qa/parsing/BuildQueryGraph.java
  27. +11 -30  src/qa/parsing/QuestionParsing.java
  28. +0 -41   src/rdf/MergedWord.java
  29. +1 -1    src/rdf/SimpleRelation.java

+7 -86  src/addition/AddtionalFix.java

@@ -20,16 +20,10 @@ public class AddtionalFix
public AddtionalFix()
{
// Some category mappings for DBpedia, try automatic linking methods later. | base form
pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters");
pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World");
pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus");
pattern2category.put("13_british_colony", "Thirteen_Colonies");
}
public void process(QueryLogger qlog)
{
fixCategory(qlog);
oneTriple(qlog);
oneNode(qlog);
@@ -48,45 +42,10 @@ public class AddtionalFix
spq.queryType = QueryType.Ask;
}
public void fixCategory(QueryLogger qlog)
{
if(qlog == null || qlog.semanticUnitList == null)
return;
String var = null, category = null;
for(SemanticUnit su: qlog.semanticUnitList)
{
if(su.centerWord.mayCategory)
{
var = "?"+su.centerWord.originalForm;
category = su.centerWord.category;
}
}
if(category != null && var != null)
for(Sparql spq: qlog.rankedSparqls)
{
boolean occured = false;
for(Triple tri: spq.tripleList)
{
if(tri.subject.equals(var))
{
occured = true;
break;
}
}
String oName = category;
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100);
spq.addTriple(triple);
}
}
/* recognize one-Node query
* Two cases:1、Special question|Imperative sentence 2、General question
* 1-1:how many [], highest [] ... | For single variable, add constraint (aggregation)
* 1-2: What is backgammon? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...)
* 1-2: 谁是狄仁杰? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...)
* 1-3: Give me all Seven Wonders of the Ancient World. | Notice, "Seven Wonders of the Ancient World" should be recognized as ENT before. (in fact it is CATEGORY in DBpedia)
* 2-1: Are there any [castles_in_the_United_States](yago:type)
* 2-2:Was Sigmund Freud married? | Lack of variable node.
@@ -101,7 +60,7 @@ public class AddtionalFix
Word[] words = qlog.s.words;
if(qlog.s.sentenceType != SentenceType.GeneralQuestion)
{
//1-1: how many [type] are there | List all [type]
//1-1: 有多少[type] | 列出所有[type]
if(target.mayType && target.tmList != null)
{
String subName = "?"+target.originalForm;
@@ -111,10 +70,10 @@ public class AddtionalFix
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
//1-2: What is [ent]?
else if(target.mayEnt && target.emList != null)
{
if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be"))
//1-2: 什么是[ent]
if(words.length >= 3 && (words[0].baseForm.equals("什么") || words[0].baseForm.equals("谁")) && words[1].baseForm.equals("是"))
{
int eid = target.emList.get(0).entityID;
String subName = target.emList.get(0).entityName;
@@ -123,24 +82,14 @@ public class AddtionalFix
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
//1-3: Give me all Seven Wonders of the Ancient World.
else if(target.mayCategory && target.category != null)
{
String oName = target.category;
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
//1-3: [ent] with other relations
}
}
else
else
{
if(target.mayEnt && target.emList != null)
{
//2-2:Was Sigmund Freud married?
//2-2:[ent]结婚了吗?
String relMention = "";
for(Word word: words)
if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?"))
@@ -162,34 +111,6 @@ public class AddtionalFix
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
//2-3:Are penguins endangered?
else
{
if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm))
{
String oName = pattern2category.get(words[target.position].baseForm);
String pName = "subject";
int pid = Globals.pd.predicate_2_id.get(pName);
int eid = target.emList.get(0).entityID;
String subName = target.emList.get(0).entityName;
Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
}
//2-1: Are there any [castles_in_the_United_States](yago:type)
else if(target.mayType && target.tmList != null)
{
String typeName = target.tmList.get(0).typeName;
String subName = "?" + target.originalForm;
//System.out.println("typeName="+typeName+" subName="+subName);
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100);
Sparql sparql = new Sparql();
sparql.addTriple(triple);
qlog.rankedSparqls.add(sparql);
}
}
}
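For orientation, the reworked 1-2 branch above now fires on Chinese copula questions such as 「谁是狄仁杰？」 instead of "what is ...". A minimal standalone sketch of that trigger check, using hypothetical token values rather than the project's Word objects:

// Sketch: the Chinese 1-2 trigger condition, over hypothetical base forms of a segmented question.
public class OneNodeCheckSketch {
    public static void main(String[] args) {
        String[] baseForms = {"谁", "是", "狄仁杰", "？"};   // segmented "谁是狄仁杰？"
        boolean oneNodeEntityQuestion =
                baseForms.length >= 3
                && (baseForms[0].equals("什么") || baseForms[0].equals("谁"))
                && baseForms[1].equals("是");
        // When true, AddtionalFix builds a one-triple SPARQL around the matched entity.
        System.out.println(oneNodeEntityQuestion);   // true
    }
}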


+4 -2  src/fgmt/RelationFragment.java

@@ -46,7 +46,9 @@ public class RelationFragment extends Fragment
public static void load() throws Exception
{
String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt";
System.out.println("Loading relation IDs and Fragments ...");
String filename = Globals.localPath + "data/pkubase/fragments/pkubase_predicate_fragment.txt";
List<String> inputs = FileUtil.readFile(filename);
relFragments = new HashMap<Integer, ArrayList<RelationFragment>>();
literalRelationSet = new HashSet<Integer>();
@@ -72,7 +74,7 @@ public class RelationFragment extends Fragment
public static void loadId() throws IOException
{
String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt";
String filename = Globals.localPath + "data/pkubase/fragments/id_mappings/pkubase_predicate_id.txt";
List<String> inputs = FileUtil.readFile(filename);
relationShortName2IdList = new HashMap<String, ArrayList<Integer>>();
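
As the loadPredicateId change in ParaphraseDictionary.java further down shows, entries in pkubase_predicate_id.txt are tab-separated and the predicate may be wrapped in angle brackets. A tiny sketch of parsing one such line, with a hypothetical entry:

// Sketch: parse a single (hypothetical) line of pkubase_predicate_id.txt.
public class PredicateIdLineSketch {
    public static void main(String[] args) {
        String line = "<毕业院校>\t42";
        String[] parts = line.split("\t");
        String predicate = parts[0];
        if (predicate.startsWith("<") && predicate.endsWith(">"))
            predicate = predicate.substring(1, predicate.length() - 1);
        int id = Integer.parseInt(parts[1]);
        System.out.println(predicate + " -> " + id);   // 毕业院校 -> 42
    }
}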



+4 -41  src/fgmt/TypeFragment.java

@@ -19,8 +19,6 @@ public class TypeFragment extends Fragment {
public static HashMap<Integer, String> typeId2ShortName = null;
public static final int NO_RELATION = -24232;
public static HashSet<String> yagoTypeList = null;
public HashSet<Integer> inEdges = new HashSet<Integer>();
public HashSet<Integer> outEdges = new HashSet<Integer>();
public HashSet<Integer> entSet = new HashSet<Integer>();
@@ -33,26 +31,6 @@ public class TypeFragment extends Fragment {
* 4, others: peace、vice
*/
public static ArrayList<String> stopYagoTypeList = null;
static void loadStopYagoTypeList()
{
stopYagoTypeList = new ArrayList<String>();
stopYagoTypeList.add("Amazon");
stopYagoTypeList.add("Earth");
stopYagoTypeList.add("TheHungerGames");
stopYagoTypeList.add("SparklingWine");
stopYagoTypeList.add("Type");
stopYagoTypeList.add("Flow");
stopYagoTypeList.add("Owner");
stopYagoTypeList.add("Series");
stopYagoTypeList.add("Shot");
stopYagoTypeList.add("Part");
stopYagoTypeList.add("Care");
stopYagoTypeList.add("Peace");
stopYagoTypeList.add("Vice");
stopYagoTypeList.add("Dodo");
stopYagoTypeList.add("CzechFilms");
stopYagoTypeList.add("ChineseFilms");
}
public TypeFragment(String fgmt, int fid)
{
@@ -100,7 +78,7 @@ public class TypeFragment extends Fragment {
public static void load() throws Exception
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt";
String filename = Globals.localPath+"data/pkubase/fragments/pkubase_type_fragment.txt";
File file = new File(filename);
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8");
@@ -128,14 +106,13 @@ public class TypeFragment extends Fragment {
// can fix some data there
// load Type Id
loadId();
System.out.println("Load "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types.");
System.out.println("Load "+typeId2ShortName.size()+" basic types.");
}
public static void loadId() throws IOException
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt";
String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt";

String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_type_id.txt";
File file = new File(filename);
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8");
BufferedReader br = new BufferedReader(in);
@@ -161,19 +138,5 @@ public class TypeFragment extends Fragment {
typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ");
br.close();
//load YAGO types
in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8");
br = new BufferedReader(in);
yagoTypeList = new HashSet<String>();
while((line = br.readLine())!=null)
{
String[] lines = line.split("\t");
String typeName = lines[0];
yagoTypeList.add(typeName);
}
loadStopYagoTypeList();
yagoTypeList.removeAll(stopYagoTypeList);
}
}

+0 -119  src/lcn/BuildIndexForEntityFragments.java

@@ -1,119 +0,0 @@
package lcn;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;


public class BuildIndexForEntityFragments{
public void indexforentity() throws Exception
{
if(EntityFragmentFields.entityId2Name == null)
EntityFragmentFields.load();
long startTime = new Date().getTime();
//Try update KB index to DBpedia2015. by husen 2016-04-08
//Try update KB index to DBpedia2016. by husen 2018-8-22
File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index");
File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt");
Analyzer luceneAnalyzer_en = new StandardAnalyzer();
IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true);
int mergeFactor = 100000; //default 10
int maxBufferedDoc = 1000; //default 10
int maxMergeDoc = Integer.MAX_VALUE; //INF
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor;
indexWriter_en.setMergeFactor(mergeFactor);
indexWriter_en.setMaxBufferedDocs(maxBufferedDoc);
indexWriter_en.setMaxMergeDocs(maxMergeDoc);
FileInputStream file = new FileInputStream(sourceDir_en);
InputStreamReader in = new InputStreamReader(file,"UTF-8");
BufferedReader br = new BufferedReader(in);
int count = 0;
while(true)
{
String _line = br.readLine();
{
if(_line == null) break;
}
count++;
if(count % 100000 == 0)
System.out.println(count);
String line = _line;
String temp[] = line.split("\t");
if(temp.length != 2)
continue;
else
{
int entity_id = Integer.parseInt(temp[0]);
if(!EntityFragmentFields.entityId2Name.containsKey(entity_id))
continue;
String entity_name = EntityFragmentFields.entityId2Name.get(entity_id);
String entity_fragment = temp[1];
entity_name = entity_name.replace("____", " ");
entity_name = entity_name.replace("__", " ");
entity_name = entity_name.replace("_", " ");
Document document = new Document();
Field EntityName = new Field("EntityName", entity_name, Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
Field EntityId = new Field("EntityId", String.valueOf(entity_id),
Field.Store.YES, Field.Index.NO);
Field EntityFragment = new Field("EntityFragment", entity_fragment,
Field.Store.YES, Field.Index.NO);
document.add(EntityName);
document.add(EntityId);
document.add(EntityFragment);
indexWriter_en.addDocument(document);
}
}
indexWriter_en.optimize();
indexWriter_en.close();
br.close();

// input the time of Build index
long endTime = new Date().getTime();
System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
public static void main(String[] args)
{
BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments();
try
{
Globals.localPath="D:/husen/gAnswer/";
bef.indexforentity();
}
catch (Exception e)
{
e.printStackTrace();
}
}
}



+0 -107  src/lcn/BuildIndexForTypeShortName.java

@@ -1,107 +0,0 @@
package lcn;

import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

import qa.Globals;
import fgmt.TypeFragment;

public class BuildIndexForTypeShortName {
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception
{
long startTime = new Date().getTime();
File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index");
Analyzer luceneAnalyzer_li = new StandardAnalyzer();
IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true);
int mergeFactor = 100000;
int maxBufferedDoc = 1000;
int maxMergeDoc = Integer.MAX_VALUE;
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor;
indexWriter_li.setMergeFactor(mergeFactor);
indexWriter_li.setMaxBufferedDocs(maxBufferedDoc);
indexWriter_li.setMaxMergeDocs(maxMergeDoc);
int count = 0;
Iterator<String> it = typeShortName2IdList.keySet().iterator();
while (it.hasNext())
{
String sn = it.next();
if (sn.length() == 0) {
continue;
}
count ++;
StringBuilder splittedSn = new StringBuilder("");
if(sn.contains("_"))
{
String nsn = sn.replace("_", " ");
splittedSn.append(nsn.toLowerCase());
}
else
{
int last = 0, i = 0;
for(i = 0; i < sn.length(); i ++)
{
// if it were not a small letter, then break it.
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z'))
{
splittedSn.append(sn.substring(last, i).toLowerCase());
splittedSn.append(' ');
last = i;
}
}
splittedSn.append(sn.substring(last, i).toLowerCase());
while(splittedSn.charAt(0) == ' ') {
splittedSn.deleteCharAt(0);
}
}
System.out.println("SplitttedType: "+splittedSn);
Document document = new Document();

Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(),
Field.Store.YES,
Field.Index.TOKENIZED,
Field.TermVector.WITH_POSITIONS_OFFSETS);
Field TypeShortName = new Field("TypeShortName", sn,
Field.Store.YES, Field.Index.NO);
document.add(SplittedTypeShortName);
document.add(TypeShortName);
indexWriter_li.addDocument(document);
}
indexWriter_li.optimize();
indexWriter_li.close();

// input the time of Build index
long endTime = new Date().getTime();
System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime));
}
public static void main (String[] args) {
try {
Globals.localPath="D:/husen/gAnswer/";
TypeFragment.load();
BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList);
} catch (Exception e) {
e.printStackTrace();
}
}

}

+44 -3  src/lcn/EntityFragmentFields.java

@@ -5,9 +5,13 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import fgmt.EntityFragment;
import qa.Globals;
import utils.FileUtil;

public class EntityFragmentFields {
@@ -18,8 +22,8 @@ public class EntityFragmentFields {
public static void load() throws IOException
{
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt";
String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt";
String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_entity_id.txt";
String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment.txt";
File file = new File(filename);
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8"));

@@ -35,7 +39,7 @@ public class EntityFragmentFields {
while((line = br.readLine()) != null)
{
String[] lines = line.split("\t");
String entName = lines[0].substring(1, lines[0].length()-1);
String entName = lines[0].trim().substring(1, lines[0].length()-1);
entityName2Id.put(entName, Integer.parseInt(lines[1]));
entityId2Name.put(Integer.parseInt(lines[1]), entName);
@@ -61,4 +65,41 @@ public class EntityFragmentFields {
br.close();
}
public static void genmini()
{
String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkuentity_id.txt";
String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment_mini.txt";
List<String> fragments = FileUtil.readFile(fragmentFileName);
ArrayList<Integer> eids = new ArrayList<Integer>();
for(String fragment: fragments)
{
int eid = Integer.parseInt(fragment.split("\t")[0]);
String fgmt = fragment.split("\t")[1];
EntityFragment ef = new EntityFragment(eid, fgmt);
eids.add(eid);
for(int ent: ef.inEntMap.keySet())
{
eids.add(ent);
}
for(int ent: ef.outEntMap.keySet())
{
eids.add(ent);
}
}
System.out.println(eids.size());
System.out.println("Loading entity id ...");
List<String> data = FileUtil.readFile(filename);
for(String line: data)
{
String[] lines = line.split("\t");
int eid = Integer.parseInt(lines[1]);
if(eids.contains(eid))
System.out.println(line);
}
}
public static void main(String[] args) {
EntityFragmentFields.genmini();
}
}

+1 -5  src/log/QueryLogger.java

@@ -12,7 +12,6 @@ import qa.Query;
import rdf.EntityMapping;
import rdf.SemanticRelation;
import rdf.Sparql;
import rdf.MergedWord;
import rdf.SemanticUnit;
import qa.Answer;
import nlp.ds.Sentence;
@@ -30,10 +29,8 @@ public class QueryLogger {
public boolean MODE_debug = false;
public boolean MODE_log = true;
public boolean MODE_fragment = true;
public boolean isMaltParserUsed = true; // Notice, we utilize Malt Parser as default parser, which is different from the older version. TODO: some coref rules need changed to fit Malt Parser.
public boolean isMaltParserUsed = false; // MaltParser is deprecated.
public HashMap<String, Integer> timeTable = null;
public ArrayList<MergedWord> mWordList = null;
public ArrayList<SemanticUnit> semanticUnitList = null;
public HashMap<Integer, SemanticRelation> semanticRelations = null;
public HashMap<Integer, SemanticRelation> potentialSemanticRelations = null;
@@ -48,7 +45,6 @@ public class QueryLogger {
{
timeTable = new HashMap<String, Integer>();
rankedSparqls = new ArrayList<Sparql>();
mWordList = query.mWordList;
}
public void reloadSentence(Sentence sentence)


+14 -181  src/nlp/ds/DependencyTree.java

@@ -6,75 +6,37 @@ import java.util.HashMap;
import java.util.List;
import java.util.Stack;

import nlp.tool.CoreNLP;
import nlp.tool.MaltParser;
import nlp.tool.StanfordParser;

import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.syntaxgraph.DependencyStructure;
import org.maltparser.core.syntaxgraph.node.DependencyNode;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.trees.GrammaticalStructure;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;

public class DependencyTree {
public DependencyTreeNode root = null;
public ArrayList<DependencyTreeNode> nodesList = null;
public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded)
public GrammaticalStructure gs = null; // Method 2: Stanford Parser
public DependencyStructure maltGraph = null; // Method 3: MaltParser
// public GrammaticalStructure gs = null; // Method 2: Stanford Parser
public HashMap<String, ArrayList<DependencyTreeNode>> wordBaseFormIndex = null;
public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) {
SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText);
this.dependencies = dependencies;
Stack<IndexedWord> stack = new Stack<IndexedWord>();
IndexedWord iwRoot = dependencies.getFirstRoot();
HashMap<IndexedWord, DependencyTreeNode> map = new HashMap<IndexedWord, DependencyTreeNode>();
nodesList = new ArrayList<DependencyTreeNode>();

stack.push(iwRoot);
root = this.setRoot(sentence.getWordByIndex(iwRoot.index()));
map.put(iwRoot, root);

while (!stack.empty())
{
IndexedWord curIWNode = stack.pop();
DependencyTreeNode curDTNode = map.get(curIWNode);
for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) {
Word w = sentence.getWordByIndex(iwChild.index());
DependencyTreeNode newDTNode = this.insert(
curDTNode,
w,
dependencies.reln(curIWNode, iwChild).getShortName());
map.put(iwChild, newDTNode);
stack.push(iwChild);
}
curDTNode.sortChildrenList();
nodesList.add(curDTNode);
}
}
public DependencyTree (Sentence sentence, StanfordParser stanfordParser) {
this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText);
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>();
nodesList = new ArrayList<DependencyTreeNode>();
List<TypedDependency> tdl = gs.typedDependencies(false);
// String[] sent = { "这", "是", "一个", "简单", "的", "句子", "。" };
String[] sent = sentence.getWordsArr();
List<CoreLabel> rawWords = SentenceUtils.toCoreLabelList(sent);
List<TypedDependency> tdl = stanfordParser.getTypedDependencyList(rawWords);
// 1. generate all nodes.
for (TypedDependency td : tdl) {
// gov
if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) {
Word w = sentence.getWordByIndex(td.gov().index());
w.posTag = td.gov().tag(); // POS TAG
DependencyTreeNode newNode = new DependencyTreeNode(w);
map.put(td.gov().index(), newNode);
nodesList.add(newNode);
@@ -82,6 +44,7 @@ public class DependencyTree {
// dep
if (!map.containsKey(td.dep().index())) {
Word w = sentence.getWordByIndex(td.dep().index());
w.posTag = td.dep().tag(); // POS TAG
DependencyTreeNode newNode = new DependencyTreeNode(w);
map.put(td.dep().index(), newNode);
nodesList.add(newNode);
@@ -118,139 +81,9 @@ public class DependencyTree {
}
}
Collections.sort(nodesList, new DependencyTreeNodeComparator());
for (DependencyTreeNode dtn : nodesList) {
dtn.linkNN(this);
}
}
public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException {
try {
// the tokens are parsed in the following line
DependencyStructure graph = maltParser.getDependencyStructure(sentence);
this.maltGraph = graph;
//System.out.println(graph);
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>();
ArrayList<DependencyTreeNode> list = new ArrayList<DependencyTreeNode>();
Stack<DependencyNode> stack = new Stack<DependencyNode>();
DependencyNode nroot = graph.getDependencyRoot();
stack.add(nroot);
// 1. generate all nodes.
while (!stack.isEmpty()) {
DependencyNode n = stack.pop();
DependencyNode sib = n.getRightmostDependent();
int key = n.getIndex();
//System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">");
boolean flag = true;
while (sib != null) {
flag = false;
stack.push(sib);
sib = sib.getLeftSibling();
}
if (flag) {
sib = n.getLeftmostDependent();
while (sib != null) {
stack.push(sib);
sib = sib.getRightSibling();
}
}
if (n.hasHead() && !map.containsKey(key)) {
//String snode = n.toString();
String sedge = n.getHeadEdge().toString();
//System.out.println("[" + snode + "] <" + sedge + ">");

/*int position = 0;
String wordOriginal = null;
String wordBase;
String postag = null;*/
String dep = null;
int idx1, idx2;
/*// position
idx1 = snode.indexOf("ID:")+3;
idx2 = snode.indexOf(' ', idx1);
position = Integer.parseInt(snode.substring(idx1, idx2));
// word
idx1 = snode.indexOf("FORM:", idx2)+5;
idx2 = snode.indexOf(' ', idx1);
wordOriginal = snode.substring(idx1, idx2);
wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase());
// postag
idx1 = snode.indexOf("POSTAG:", idx2)+7;
idx2 = snode.indexOf(' ', idx1);
postag = snode.substring(idx1, idx2);*/
// dep
idx1 = sedge.lastIndexOf(':')+1;
idx2 = sedge.lastIndexOf(' ');
dep = sedge.substring(idx1, idx2);
if (dep.equals("null")) {
dep = null;
}
else if (dep.equals("punct")) {// No consider about punctuation
continue;
}
DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key));
newNode.dep_father2child = dep;
map.put(key, newNode);
list.add(newNode);
}
}
// 2. add edges
for (Integer k : map.keySet()) {
DependencyNode n = graph.getDependencyNode(k);
DependencyTreeNode dtn = map.get(k);
if (dtn.dep_father2child == null) {
this.setRoot(dtn);
this.root.levelInTree = 0;
this.root.dep_father2child = "root";
}
else {
DependencyTreeNode father = map.get(n.getHead().getIndex());
DependencyTreeNode child = map.get(n.getIndex());
child.father = father;
father.childrenList.add(child);
}
}
// Fix the tree for some cases.
if(list.size() > 11)
{
DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5);
if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be"))
{
if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie"))
{
dt1.father.childrenList.remove(dt1);
dt1.father = dt2;
dt2.childrenList.add(dt1);
}
}
}
// add levelInTree, sort childrenList & nodesList
for (DependencyTreeNode dtn : list) {
if (dtn.father != null) {
dtn.levelInTree = dtn.father.levelInTree + 1;
dtn.sortChildrenList();
}
}
nodesList = list;
Collections.sort(nodesList, new DependencyTreeNodeComparator());
for (DependencyTreeNode dtn : nodesList) {
dtn.linkNN(this);
}
} catch (MaltChainedException e) {
//e.printStackTrace();
//System.err.println("MaltParser exception: " + e.getMessage());
throw e;
}
// for (DependencyTreeNode dtn : nodesList) {
// dtn.linkNN(this);
// }
}
public DependencyTreeNode setRoot(Word w) {


+49 -25  src/nlp/ds/Sentence.java

@@ -2,10 +2,10 @@ package nlp.ds;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;

import qa.Globals;
import qa.Query;
import rdf.MergedWord;

public class Sentence {
public String plainText = null;
@@ -18,40 +18,64 @@ public class Sentence {
public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence}
public SentenceType sentenceType = SentenceType.SpecialQuestion;
public Sentence (String s)
// public Sentence (String s)
// {
// plainText = s;
// words = Globals.coreNLP.getTaggedWords(plainText);
// map = new HashMap<String, Word>();
// for (Word w : words)
// map.put(w.key, w);
// }
// for tokenized sentence
public Sentence (List<Word> wordList, String s)
{
plainText = s;
words = Globals.coreNLP.getTaggedWords(plainText);
words = new Word[wordList.size()];
for(int i=0; i<wordList.size(); i++)
words[i] = wordList.get(i);
map = new HashMap<String, Word>();
for (Word w : words)
map.put(w.key, w);
}
public Sentence (Query query, String s)
{
plainText = s;
words = Globals.coreNLP.getTaggedWords(plainText);
// inherit NodeRecognition's information
for(Word word: words)
// public Sentence (Query query, String s)
// {
// plainText = s;
// words = Globals.coreNLP.getTaggedWords(plainText);
// // inherit NodeRecognition's information
// for(Word word: words)
// {
// for(MergedWord mWord: query.mWordList)
// {
// if(word.originalForm.equals(mWord.name))
// {
// word.mayLiteral = mWord.mayLiteral;
// word.mayEnt = mWord.mayEnt;
// word.mayType = mWord.mayType;
// word.mayCategory = mWord.mayCategory;
// word.tmList = mWord.tmList;
// word.emList = mWord.emList;
// word.category = mWord.category;
// }
// }
// }
// map = new HashMap<String, Word>();
// for (Word w : words)
// map.put(w.key, w);
// }

public String[] getWordsArr() {
String[] wordArr = new String[words.length];
int cnt = 0;
for(Word w: words)
{
for(MergedWord mWord: query.mWordList)
{
if(word.originalForm.equals(mWord.name))
{
word.mayLiteral = mWord.mayLiteral;
word.mayEnt = mWord.mayEnt;
word.mayType = mWord.mayType;
word.mayCategory = mWord.mayCategory;
word.tmList = mWord.tmList;
word.emList = mWord.emList;
word.category = mWord.category;
}
}
wordArr[cnt++] = w.originalForm;
}
map = new HashMap<String, Word>();
for (Word w : words)
map.put(w.key, w);
return wordArr;
}
public ArrayList<Word> getWordsByString (String w) {
ArrayList<Word> ret = new ArrayList<Word>();
for (Word wo: words) {


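The new Sentence(List<Word>, String) constructor above expects an already-segmented question, since whitespace tokenization does not apply to Chinese. A rough sketch of feeding it jieba output, assuming the Word(baseForm, originalForm, posTag, position) constructor seen elsewhere in this codebase and the same jieba segmenter library that EntityRecognitionCh uses; the POS tag is left null here and filled in later during parsing:

// Sketch (assumptions as noted above): build a tokenized Sentence from jieba segmentation.
import java.util.ArrayList;
import java.util.List;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;
import nlp.ds.Sentence;
import nlp.ds.Word;

public class SentenceBuildSketch {
    public static void main(String[] args) {
        String question = "谁是狄仁杰？";
        JiebaSegmenter segmenter = new JiebaSegmenter();
        List<Word> wordList = new ArrayList<Word>();
        int position = 1;                                      // Word positions are 1-based
        for (SegToken token : segmenter.process(question, SegMode.SEARCH)) {
            // baseForm == originalForm for Chinese; the parser assigns POS tags later
            wordList.add(new Word(token.word, token.word, null, position++));
        }
        Sentence sentence = new Sentence(wordList, question);
        System.out.println(String.join(" ", sentence.getWordsArr()));
    }
}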
+0 -201  src/nlp/tool/CoreNLP.java

@@ -1,201 +0,0 @@
package nlp.tool;

import java.util.List;
import java.util.Properties;

import nlp.ds.Word;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.semgraph.SemanticGraph;
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.util.CoreMap;

public class CoreNLP {

// CoreNLP can also recognize TIME and NUMBER (see SUTime)
private StanfordCoreNLP pipeline_lemma;
public CoreNLP () {
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
/*Properties props_all = new Properties();
props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref"
pipeline_all = new StanfordCoreNLP(props_all);*/

Properties props_lemma = new Properties();
props_lemma.put("annotators", "tokenize, ssplit, pos, lemma");
pipeline_lemma = new StanfordCoreNLP(props_lemma);

}
// For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html"
public String getBaseFormOfPattern (String text) {
String ret = new String("");
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline_lemma.annotate(document);


// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
int count = 0;
for(CoreMap sentence: sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
// this is the base form (lemma) of the token
String lemma = token.getString(LemmaAnnotation.class);
ret += lemma;
ret += " ";
}
count ++;
if (count % 100 == 0) {
System.out.println(count);
}
}
return ret.substring(0, ret.length()-1);
}
public SemanticGraph getBasicDependencies (String s) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(s);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the Stanford dependency graph of the current sentence
SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class);
return dependencies;
}
return null;
}

public Tree getParseTree (String text) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the parse tree of the current sentence
return sentence.get(TreeAnnotation.class);
}
return null;
}
/**
* How to use:
* for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
* // this is the text of the token
* String word = token.get(TextAnnotation.class);
* // this is the POS tag of the token
* String pos = token.get(PartOfSpeechAnnotation.class);
* }
* @param s
* @return
*/
public CoreMap getPOS (String s) {
// create an empty Annotation just with the given text
Annotation document = new Annotation(s);
// run all Annotators on this text
pipeline_lemma.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// this is the sentence with POS Tags
return sentence;
}
return null;
}
public Word[] getTaggedWords (String sentence) {
CoreMap taggedSentence = getPOS(sentence);
Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()];
int count = 0;
for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
//System.out.println(word+"["+pos+"]");
ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1);
count ++;
}
return ret;
}
/*public void demo () {
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution
Properties props = new Properties();
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
// read some text in the text variable
String text = ... // Add your text here!
// create an empty Annotation just with the given text
Annotation document = new Annotation(text);
// run all Annotators on this text
pipeline.annotate(document);
// these are all the sentences in this document
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types
List<CoreMap> sentences = document.get(SentencesAnnotation.class);
for(CoreMap sentence: sentences) {
// traversing the words in the current sentence
// a CoreLabel is a CoreMap with additional token-specific methods
for (CoreLabel token: sentence.get(TokensAnnotation.class)) {
// this is the text of the token
String word = token.get(TextAnnotation.class);
// this is the POS tag of the token
String pos = token.get(PartOfSpeechAnnotation.class);
// this is the NER label of the token
String ne = token.get(NamedEntityTagAnnotation.class);
}

// this is the parse tree of the current sentence
Tree tree = sentence.get(TreeAnnotation.class);

// this is the Stanford dependency graph of the current sentence
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
}

// This is the coreference link graph
// Each chain stores a set of mentions that link to each other,
// along with a method for getting the most representative mention
// Both sentence and token offsets start at 1!
Map<Integer, CorefChain> graph =
document.get(CorefChainAnnotation.class);
}*/
}

+1 -4  src/nlp/tool/Main.java

@@ -21,13 +21,10 @@ public class Main {
break;
try {
long t1 = System.currentTimeMillis();
Sentence s = new Sentence(question);
Sentence s = null;
DependencyTree dt = new DependencyTree(s, Globals.stanfordParser);
System.out.println("====StanfordDependencies====");
System.out.println(dt);
DependencyTree dt2 = new DependencyTree(s, Globals.maltParser);
System.out.println("====MaltDependencies====");
System.out.println(dt2);
long t2 = System.currentTimeMillis();
System.out.println("time=" + (t2-t1) + "ms");
} catch (Exception e) {


+0 -70  src/nlp/tool/MaltParser.java

@@ -1,70 +0,0 @@
package nlp.tool;


import nlp.ds.Sentence;
import nlp.ds.Word;

import org.maltparser.MaltParserService;
import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.syntaxgraph.DependencyStructure;

import qa.Globals;

public class MaltParser {
private MaltParserService service = null;
public MaltParser() {
try
{
System.out.print("Loading MaltParser ...");
service = new MaltParserService();
// Inititalize the parser model 'model0' and sets the working directory to '.' and sets the logging file to 'parser.log'
//service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log");
service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log");
firstParse();
System.out.println("ok!");
} catch (MaltChainedException e) {
e.printStackTrace();
System.err.println("MaltParser exception: " + e.getMessage());
}
}
private void firstParse() {
String[] tokens = new String[12];
tokens[0] = "1\tIn\t_\tIN\tIN\t_";
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_";
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_";
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_";
tokens[4] = "5\tby\t_\tIN\tIN\t_";
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_";
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_";
tokens[7] = "8\twas\t_\tVBD\tVBD\t_";
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_";
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_";
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_";
tokens[11] = "12\t?\t_\t.\t.\t_";
try {
service.parse(tokens);
} catch (MaltChainedException e) {
e.printStackTrace();
}
}
public DependencyStructure getDependencyStructure (Sentence sentence) {
try {
return service.parse(getTaggedTokens(sentence));
} catch (MaltChainedException e) {
e.printStackTrace();
}
return null;
}
private String[] getTaggedTokens (Sentence sentence) {
String[] ret = new String[sentence.words.length];
int count = 0;
for (Word w : sentence.words) {
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_");
count ++;
}
return ret;
}
}

+0 -73  src/nlp/tool/MaltParserCon.java

@@ -1,73 +0,0 @@
package nlp.tool;

import java.io.File;
import java.net.URL;

import nlp.ds.Sentence;
import nlp.ds.Word;

import org.maltparser.concurrent.ConcurrentMaltParserModel;
import org.maltparser.concurrent.ConcurrentMaltParserService;
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph;
import org.maltparser.core.exception.MaltChainedException;
//import org.maltparser.core.syntaxgraph.DependencyStructure;


public class MaltParserCon {
private ConcurrentMaltParserModel model = null;
public ConcurrentDependencyGraph outputGraph = null;
public MaltParserCon(){
try{
System.out.println("Loading Maltparser...\n");
URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL();
model = ConcurrentMaltParserService.initializeParserModel(ModelURL);
firstTest();
System.out.println("ok!\n");
}catch(Exception e){
e.printStackTrace();
System.err.println("MaltParser exception: " + e.getMessage());
}
}
private void firstTest(){
String[] tokens = new String[12];
tokens[0] = "1\tIn\t_\tIN\tIN\t_";
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_";
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_";
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_";
tokens[4] = "5\tby\t_\tIN\tIN\t_";
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_";
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_";
tokens[7] = "8\twas\t_\tVBD\tVBD\t_";
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_";
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_";
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_";
tokens[11] = "12\t?\t_\t.\t.\t_";
try {
outputGraph = model.parse(tokens);
} catch (Exception e) {
e.printStackTrace();
}
System.out.println(outputGraph);
}
public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) {
try {
return model.parse(getTaggedTokens(sentence));
} catch (MaltChainedException e) {
e.printStackTrace();
}
return null;
}
private String[] getTaggedTokens (Sentence sentence) {
String[] ret = new String[sentence.words.length];
int count = 0;
for (Word w : sentence.words) {
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_");
count ++;
}
return ret;
}
}

+0 -53  src/nlp/tool/NERecognizer.java

@@ -1,53 +0,0 @@
package nlp.tool;

import java.util.List;

import qa.Globals;

import nlp.ds.Sentence;
import nlp.ds.Word;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation;
import edu.stanford.nlp.ling.CoreLabel;

public class NERecognizer {
static String serializedClassifier;
static AbstractSequenceClassifier<CoreLabel> classifier;
//public static String localPath="E:\\Hanshuo\\gAnswer\\";
public NERecognizer() {
serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz";
classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier);
}
/*public NERecognizer(String basePath, boolean flag) {
serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz";
}*/
public void recognize(Sentence sentence) {
List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0);
for (CoreLabel cl : lcl) {
int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1;
Word w = sentence.getWordByIndex(position);
String ner = cl.get(AnswerAnnotation.class);
if (ner.equals("O")) w.ner = null;
else w.ner = ner;
}
}
public static void main(String[] args) {
System.out.println("Test NER");
Globals.init();
Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?"
Globals.nerRecognizer.recognize(s);
for (Word word : s.words) {
System.out.print(word + " ");
System.out.println("ner=" + word.ner);
}
}
}

+28 -28  src/nlp/tool/StanfordParser.java

@@ -4,7 +4,6 @@ import java.io.StringReader;
import java.util.List;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.TokenizerFactory;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
@@ -13,39 +12,40 @@ import edu.stanford.nlp.trees.GrammaticalStructureFactory;
import edu.stanford.nlp.trees.PennTreebankLanguagePack;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreebankLanguagePack;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

public class StanfordParser {
private LexicalizedParser lp;
private TokenizerFactory<CoreLabel> tokenizerFactory;
private TreebankLanguagePack tlp;
private GrammaticalStructureFactory gsf;
private ChineseGrammaticalStructure gs;
// private TokenizerFactory<CoreLabel> tokenizerFactory;
// private TreebankLanguagePack tlp;
// private GrammaticalStructureFactory gsf;
public StanfordParser() {
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
tlp = new PennTreebankLanguagePack();
gsf = tlp.grammaticalStructureFactory();
// lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
// tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
// tlp = new PennTreebankLanguagePack();
// gsf = tlp.grammaticalStructureFactory();
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz");
}
public GrammaticalStructure getGrammaticalStructure (String sentence) {
List<CoreLabel> rawWords2 =
tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
// Converts a Sentence/List/String into a Tree.
// In all circumstances, the input will be treated as a single sentence to be parsed.
Tree parse = lp.apply(rawWords2);

return gsf.newGrammaticalStructure(parse);
/*List<TypedDependency> tdl = gs.typedDependencies(false);
for (TypedDependency td : tdl) {
System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")");
System.out.println("gov="+td.gov()
+"\tgov.index="
+td.gov().index()
+"\tgov.value="
+td.gov().value()
+"\tgov.pos="
+((TreeGraphNode)td.gov().parent()).value());
}*/
//System.out.println(tdl);
// public GrammaticalStructure getGrammaticalStructure (String sentence) {
// List<CoreLabel> rawWords2 =
// tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
//
// Tree parse = lp.apply(rawWords2);
//
// return gsf.newGrammaticalStructure(parse);
// }
public List<TypedDependency> getTypedDependencyList(List<CoreLabel> rawWords)
{
Tree parse = lp.apply(rawWords);
gs = new ChineseGrammaticalStructure(parse);
return gs.typedDependenciesCCprocessed();
}
}
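
The parser above now loads the Chinese PCFG model and derives dependencies through ChineseGrammaticalStructure rather than the English grammaticalStructureFactory path. A minimal sketch of the same call sequence directly against the Stanford API, reusing the pre-segmented example sentence from the DependencyTree diff:

// Sketch: pre-segmented Chinese tokens -> typed dependencies via chinesePCFG.
import java.util.List;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TypedDependency;
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

public class ChineseDepSketch {
    public static void main(String[] args) {
        LexicalizedParser lp =
                LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz");
        String[] tokens = {"这", "是", "一个", "简单", "的", "句子", "。"};   // already segmented
        List<CoreLabel> rawWords = SentenceUtils.toCoreLabelList(tokens);
        Tree parse = lp.apply(rawWords);
        ChineseGrammaticalStructure gs = new ChineseGrammaticalStructure(parse);
        for (TypedDependency td : gs.typedDependenciesCCprocessed()) {
            // one line per relation: shortName(governor, dependent)
            System.out.println(td.reln().getShortName() + "(" + td.gov() + ", " + td.dep() + ")");
        }
    }
}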

+46 -109  src/paradict/ParaphraseDictionary.java

@@ -10,19 +10,17 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

import com.huaban.analysis.jieba.SegToken;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;



import nlp.tool.CoreNLP;
import qa.Globals;
import qa.extract.EntityRecognitionCh;

public class ParaphraseDictionary {
public static String localDataPath;
public static String dbpedia_relation_paraphrases_baseform_withScore;
public static String dbpedia_relation_paraphrases_baseform_withScore_rerank;
public static String dbpedia_relation_paraphrases_handwrite;
public static String dbpedia_predicate_id;
public static String relation_paraphrases_path;
public static String predicate_id_path;
public static String dbpedia_dbo_predicate;

public HashMap<String, Integer> predicate_2_id = null;
@@ -41,24 +39,14 @@ public class ParaphraseDictionary {
public int paraphrasedPredCount = 0;
public int lineCount = 0;
/**
* constructor
* @param parser
* @param ner
*/
public ParaphraseDictionary () {
String fixedPath = Globals.localPath;
String fixedPath = Globals.localPath+"data/pkubase/";

System.out.println(System.getProperty("user.dir"));
localDataPath = fixedPath + "data/DBpedia2016/parapharse/";
dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt";
dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt";
dbpedia_predicate_id = localDataPath + "16predicate_id.txt";
dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt";
relation_paraphrases_path = fixedPath + "paraphrase/pkubase-paraphrase.txt";
predicate_id_path = fixedPath + "fragments/id_mappings/pkubase_predicate_id.txt";
bannedTypes = new HashSet<String>();
bannedTypes.add("Mayor");
relns_subject = new HashSet<String>();
relns_subject.add("subj");
@@ -76,25 +64,16 @@ public class ParaphraseDictionary {
relns_object.add("obj");
relns_object.add("pobj");
prepositions = new HashSet<String>();
prepositions.add("in");//in at on with to from before after of for
prepositions.add("at");
prepositions.add("on");
prepositions.add("with");
prepositions.add("to");
prepositions.add("from");
prepositions.add("before");
prepositions.add("after");
prepositions.add("of");
prepositions.add("for");
prepositions.add("as");
prepositions = new HashSet<String>(); //TODO: safe delete

try {
loadPredicateId();
loadDboPredicate();
loadParaDict();
addPredicateAsNLPattern();
addHandwriteAsNLPattern();
// loadDboPredicate();
// loadParaDict();
buildInvertedIndex();
typePredicateID = predicate_2_id.get("type");
typePredicateID = predicate_2_id.get("类型");
} catch (Exception e) {
e.printStackTrace();
}
@@ -108,8 +87,7 @@ public class ParaphraseDictionary {
predicate_2_id = new HashMap<String, Integer>();
id_2_predicate = new HashMap<Integer, String>();
String input_filename = dbpedia_predicate_id;
File file = new File(input_filename);
File file = new File(predicate_id_path);
InputStreamReader in = null;
BufferedReader br = null;
try{
@@ -118,6 +96,8 @@ public class ParaphraseDictionary {
String line = null;
while ((line = br.readLine())!= null) {
String[] lines = line.split("\t");
if(lines[0].startsWith("<") && lines[0].endsWith(">"))
lines[0] = lines[0].substring(1, lines[0].length()-1);
predicate_2_id.put(lines[0], Integer.parseInt(lines[1]));
id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]);
}
@@ -192,13 +172,10 @@ public class ParaphraseDictionary {
InputStreamReader in = null;
BufferedReader br = null;
try{
String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank;
File file = new File(inputFileName);
in = new InputStreamReader(new FileInputStream(file), "utf-8");
in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8");
br = new BufferedReader(in);
String line = null;
int lineCount = 0;
//line = br.readLine();//read the first line which indicates the format
while ((line = br.readLine()) != null)
{
if (line.startsWith("#")) continue;
@@ -259,72 +236,23 @@ public class ParaphraseDictionary {
* A set of very important NL patterns are the predicates themselves!
*/
public void addPredicateAsNLPattern () {
if(nlPattern_2_predicateList == null)
nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>();
final int support = 200;
int predicate_id;
for (String p : predicate_2_id.keySet())
{
// TODO: Omitting some bad relations (should be discarded in future)
if(p.equals("state") || p.equals("states"))
continue;
predicate_id = predicate_2_id.get(p);
StringBuilder pattern = new StringBuilder("");
// Work/runtime 11,SpaceStation/volume 68 and some predicates have prefix (DBpedia 2015), discard the prefix when generating pattern
if(p.contains("/"))

// TODO: segmentation: 1) tokenize 2) single ch-word
String patternString = "";
List<SegToken> q=EntityRecognitionCh.segmenter.process(p, SegMode.SEARCH);
for (SegToken t:q)
{
if(p.charAt(0)>='A' && p.charAt(0)<='Z')
p = p.substring(p.indexOf("/")+1);
//gameW/l 1974
else
p = p.replace("/", "");
}
int last = 0, i = 0;
for(i = 0; i < p.length(); i ++) {
// if it were not a small letter, then break it.
if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) {
pattern.append(p.substring(last, i).toLowerCase());
pattern.append(" ");
last = i;
}
patternString += t.word + " ";
}
pattern.append(p.substring(last, i).toLowerCase());
for (i = 3; i < pattern.length(); i ++) {
// the blank between two digits should be deleted.
if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') {
pattern.deleteCharAt(i-1);
}
// the blank between I and D should be deleted.
else if (pattern.charAt(i)=='d'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)=='i'
&& pattern.charAt(i-3)==' ') {
pattern.deleteCharAt(i-1);
}
// the blank between D and B should be deleted.
else if (pattern.charAt(i)=='b'
&& pattern.charAt(i-1)==' '
&& pattern.charAt(i-2)=='d'
&& pattern.charAt(i-3)==' ') {
pattern.deleteCharAt(i-1);
}
}
// pattern -> base form
/*String[] ptns = pattern.toString().split(" ");
pattern = new StringBuilder("");
for (String s : ptns) {
pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s));
pattern.append(" ");
}
pattern.deleteCharAt(pattern.length()-1);
String patternString = pattern.toString();*/
// Special case cannot use base form, eg, foundingYear //TODO: maybe Porter's Algorithm
String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString());
patternString = patternString.trim();
//System.out.println(p + "-->" + patternString);
if (!nlPattern_2_predicateList.containsKey(patternString)) {
@@ -340,30 +268,39 @@ public class ParaphraseDictionary {
}
public void addHandwriteAsNLPattern() throws IOException {
String inputFileName = dbpedia_relation_paraphrases_handwrite;
InputStreamReader in = null;
BufferedReader br = null;
try{
File file = new File(inputFileName);
in = new InputStreamReader(new FileInputStream(file), "utf-8");
in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8");
br = new BufferedReader(in);
String line = null;
//int lineCount = 0;
//line = br.readLine();//read the first line which indicates the format
while ((line = br.readLine()) != null) {
if (line.startsWith("#") || line.isEmpty()) continue;
//lineCount ++;
String[] content = line.split("\t");
if(!predicate_2_id.containsKey(content[0]))
continue;
int predicateID = predicate_2_id.get(content[0]);
String nlPattern = content[1].toLowerCase();
String nlPattern = content[1];
int support = Integer.parseInt(content[2]);
// Need Segmentation
if(!nlPattern.contains(" "))
{
String patternString = "";
List<SegToken> q=EntityRecognitionCh.segmenter.process(nlPattern, SegMode.SEARCH);
for (SegToken t:q)
{
patternString += t.word + " ";
}
patternString = patternString.trim();
nlPattern = patternString;
}
if (!nlPattern_2_predicateList.containsKey(nlPattern)) {
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>());
}
@@ -434,7 +371,7 @@ public class ParaphraseDictionary {
}
public static void main (String[] args) {
Globals.coreNLP = new CoreNLP();
// Globals.coreNLP = new CoreNLP();
Globals.pd = new ParaphraseDictionary();
//Globals.pd.showNLPatterns();
}


+18 -12  src/qa/GAnswer.java

@@ -32,8 +32,8 @@ public class GAnswer {
QueryLogger qlog = null;
try
{
if (input.length() <= 5)
return null;
// if (input.length() <= 5)
// return null;
System.out.println("[Input:] "+input);
@@ -47,17 +47,17 @@ public class GAnswer {
// Try to solve each NR plan, and combine the ranked SPARQLs.
// We only reserve LOG of BEST NR plan for convenience.
// Now only 1 plan
for(int i=query.sList.size()-1; i>=0; i--)
{
Sentence possibleSentence = query.sList.get(i);
qlog.reloadSentence(possibleSentence);
// qlog.isMaltParserUsed = true;
// LOG
System.out.println("transQ: "+qlog.s.plainText);
qlog.NRlog = query.preLog;
// qlog.NRlog = query.preLog;
qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n";
qlog.SQGlog += qlog.NRlog;
// qlog.SQGlog += qlog.NRlog;
qlog.timeTable.put("step0", (int)NRtime);
// step 1: question parsing (dependency tree, sentence type)
@@ -91,7 +91,7 @@ public class GAnswer {
qlog.rankedSparqls = rankedSparqls;
System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size());
// Detect question focus.
// Detect question focus. TODO: in which cases the question focus != target?
for (int i=0; i<qlog.rankedSparqls.size(); i++)
{
// First detect by SPARQLs.
@@ -156,7 +156,7 @@ public class GAnswer {
{
// modified by Lin Yinnian using ghttp - 2018-9-28
GstoreConnector gc = new GstoreConnector(Globals.QueryEngineIP, Globals.QueryEnginePort);
String answer = gc.query("root", "123456", "dbpedia16", spq.toStringForGStore2());
String answer = gc.query("endpoint", "123", "pkubase", spq.toStringForGStore2());
System.out.println(answer);
String[] rawLines = answer.split("\n");
@@ -199,9 +199,13 @@ public class GAnswer {
int i =1;
//file in/output
List<String> inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt");
List<String> inputList = FileUtil.readFile("data/test/mini-ccks.txt");
for(String input: inputList)
{
if (input.length()<2 || input.charAt(0)!='q') continue;
System.out.println("----------------------------------------");
System.out.println(input);
ArrayList<String> outputs = new ArrayList<String>();
ArrayList<String> spqs = new ArrayList<String>();
spqs.add("id:"+String.valueOf(i));
@@ -220,9 +224,9 @@ public class GAnswer {
System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size());
outputs.add(qlog.SQGlog);
outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms");
outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms");
outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms");
// outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms");
// outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms");
// outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms");
long excuting_st_time = System.currentTimeMillis();
Matches m = null;
@@ -274,8 +278,10 @@ public class GAnswer {
outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n");
}
}
else
outputs.add("");
FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true);
FileUtil.writeFile(outputs, "data/test/mini-ccks.out", true);
}
}
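
For reference, a minimal sketch of the new query path shown above: GAnswer now connects to the pkubase gStore endpoint and consumes the answer line by line. The constructor arguments, credentials, database name, and query(...) call mirror the hunk above; the SPARQL string is illustrative only, and the GstoreConnector package name is an assumption since its import is not shown in this diff.

import jgsc.GstoreConnector; // package name assumed; the import is not part of this diff

public class PkubaseQuerySketch {
    public static void main(String[] args) {
        // Same endpoint, credentials and database name as in GAnswer.java above.
        GstoreConnector gc = new GstoreConnector("pkubase.gstore-pku.com", 80);
        String sparql = "select ?x where { <大兴安岭> <终点> ?x . }"; // illustrative query only
        String answer = gc.query("endpoint", "123", "pkubase", sparql);
        for (String line : answer.split("\n")) {             // raw result lines, split as in GAnswer.java
            System.out.println(line);
        }
    }
}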


+ 12
- 36
src/qa/Globals.java

@@ -8,26 +8,18 @@ import lcn.EntityFragmentFields;
import fgmt.RelationFragment;
import fgmt.TypeFragment;
import paradict.ParaphraseDictionary;
import qa.mapping.DBpediaLookup;
import nlp.tool.NERecognizer;
import nlp.tool.CoreNLP;
import nlp.tool.MaltParser;
import nlp.tool.StanfordParser;
import nlp.tool.StopWordsList;

public class Globals {
// nlp tools
public static CoreNLP coreNLP;
public static StanfordParser stanfordParser;
public static StopWordsList stopWordsList;
public static MaltParser maltParser;
public static NERecognizer nerRecognizer;
// relation paraphrase dictionary
public static ParaphraseDictionary pd;
// entity linking system
public static DBpediaLookup dblk;
public static int MaxAnswerNum = 100;
public static String Dataset = "dbpedia 2016";
public static String Dataset = "pkubase";
public static String Version = "0.1.2";
public static String GDBsystem = "gStore v0.7.2";
@@ -39,34 +31,25 @@ public class Globals {
public static int evaluationMethod = 2;
public static String localPath = "./././";
public static String QueryEngineIP = "dbpedia16.gstore-pku.com"; // Notice, PORT number is in the evaluation function.
public static String QueryEngineIP = "pkubase.gstore-pku.com"; // Notice, PORT number is in the evaluation function.
public static int QueryEnginePort = 80;
public static void init ()
{
System.out.println("====== gAnswer2.0 over DBpedia ======");
System.out.println("====== gAnswer2.0 over Pkubase ======");

long t1, t2, t3, t4, t5, t6, t7, t8, t9;
t1 = System.currentTimeMillis();
coreNLP = new CoreNLP();
t2 = System.currentTimeMillis();
stanfordParser = new StanfordParser();
t3 = System.currentTimeMillis();
maltParser = new MaltParser();
t4 = System.currentTimeMillis();
nerRecognizer = new NERecognizer();
t5 = System.currentTimeMillis();
t2 = System.currentTimeMillis();
stopWordsList = new StopWordsList();
t6 = System.currentTimeMillis();
t3 = System.currentTimeMillis();
pd = new ParaphraseDictionary();
t7 = System.currentTimeMillis();
t4 = System.currentTimeMillis();
try
{
EntityFragmentFields.load();
@@ -78,20 +61,13 @@ public class Globals {
e1.printStackTrace();
}
t8 = System.currentTimeMillis();
dblk = new DBpediaLookup();
t9 = System.currentTimeMillis();
t5 = System.currentTimeMillis();
System.out.println("======Initialization======");
System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms.");
System.out.println("StanfordParser: " + (t3-t2) + "ms.");
System.out.println("MaltParser: " + (t4-t3) + "ms.");
System.out.println("NERecognizer: " + (t5-t4) + "ms.");
System.out.println("StopWordsList: " + (t6-t5) + "ms.");
System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms.");
System.out.println("GraphFragments: " + (t8-t7) + "ms.");
System.out.println("DBpediaLookup: " + (t9-t8) + "ms.");
System.out.println("* Total *: " + (t9-t1) + "ms.");
System.out.println("StanfordParser: " + (t2-t1) + "ms.");
System.out.println("StopWordsList: " + (t3-t2) + "ms.");
System.out.println("ParaphraseDict: " + (t4-t3) + "ms.");
System.out.println("GraphFragments: " + (t5-t4) + "ms.");
System.out.println("* Total *: " + (t5-t1) + "ms.");
System.out.println("==========================");
}



+ 32
- 35
src/qa/Query.java

@@ -1,10 +1,11 @@
package qa;

import java.util.ArrayList;
import java.util.List;

import nlp.ds.Sentence;
import qa.extract.EntityRecognition;
import rdf.MergedWord;
import nlp.ds.Word;
import qa.extract.EntityRecognitionCh;

/**
* 1. preprocessing of question
@@ -21,7 +22,7 @@ public class Query
public String queryId = null;
public String preLog = "";
public ArrayList<MergedWord> mWordList = null;
public List<Word> words = null;
public Query(){}
public Query(String _question)
@@ -32,15 +33,17 @@ public class Query
TransferedQuestion = getTransferedQuestion(NLQuestion);
// step1. NODE Recognition
MergedQuestionList = getMergedQuestionList(TransferedQuestion);
// MergedQuestionList = getMergedQuestionList(TransferedQuestion);
words = EntityRecognitionCh.parseSentAndRecogEnt(TransferedQuestion);
// build Sentence
sList = new ArrayList<Sentence>();
for(String mergedQuestion: MergedQuestionList)
{
Sentence sentence = new Sentence(this, mergedQuestion);
sList.add(sentence);
}
sList.add(new Sentence(words, TransferedQuestion)); // TODO: TransferedQuestion or _question
// for(String mergedQuestion: MergedQuestionList)
// {
// Sentence sentence = new Sentence(this, mergedQuestion);
// sList.add(sentence);
// }
}
public boolean isDigit(char ch)
@@ -66,6 +69,14 @@ public class Query
*/
public String getTransferedQuestion(String question)
{
//discard ? ! .
if(question.endsWith("?") || question.endsWith("。") || question.endsWith("!"))
question = question.substring(0, question.length()-1);
//discard 《》 because the Stanford parser does NOT recognize them. TODO: why?
question = question.replace("《", "").replace("》", "");
question = question.replace("“", "").replace("”", ""); // now just discard "" because they confuse the parser.
//rule1: discard ".", because "." and "_" will be disconnected by the parser. Discard a trailing "'", which may pollute NER
question = question.replace("' ", " ");
String [] words = question.split(" ");
@@ -84,45 +95,31 @@ public class Query
ret = ret.substring(0,ret.length()-1);
ret = ret.replace("-", " ");
ret = ret.replace("in america", "in United States");
//rule2: as well as -> and
ret = ret.replace("as well as", "and");
//rule3: movie -> film
ret = ret.replace(" movie", " film");
ret = ret.replace(" movies", " films");

return ret;
}
/**
* Recognize entity & type & literal in KB and replace " " in Phrases with "_"
* @param question
* @return merged question list
*/
public ArrayList<String> getMergedQuestionList(String question)
{
ArrayList<String> mergedQuestionList = null;
//entity & type recognize
EntityRecognition er = new EntityRecognition();
mergedQuestionList = er.process(question);
preLog = er.preLog;
mWordList = er.mWordList;

return mergedQuestionList;
}
public String removeQueryId(String question)
{
String ret = question;
// case 1: 1\t
int st = question.indexOf("\t");
if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9')
if(st!=-1 && question.length()>4 && isDigit(question.charAt(0)))
{
queryId = question.substring(0,st);
ret = question.substring(st+1);
System.out.println("Extract QueryId :"+queryId);
}
// case 2: q1: | 1:
st = question.indexOf(":");
if(st!=-1 && st<6 && question.length()>4 && (isDigit(question.charAt(0)) ||question.startsWith("q")))
{
queryId = question.substring(0,st).replace("q", "");
ret = question.substring(st+1);
System.out.println("Extract QueryId :"+queryId);
}
return ret;
}
}
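
For reference, a simplified restatement of the two id formats removeQueryId now accepts, with a worked example; the question text is illustrative and not taken from the commit.

public class RemoveQueryIdSketch {
    // Case 1: "23\t<question>"  -- leading number followed by a tab.
    // Case 2: "q23:<question>" or "23:<question>" -- short id before a colon.
    static String strip(String question) {
        int st = question.indexOf("\t");
        if (st != -1 && question.length() > 4 && Character.isDigit(question.charAt(0)))
            return question.substring(st + 1);
        st = question.indexOf(":");
        if (st != -1 && st < 6 && question.length() > 4
                && (Character.isDigit(question.charAt(0)) || question.startsWith("q")))
            return question.substring(st + 1);
        return question;
    }

    public static void main(String[] args) {
        System.out.println(strip("q23:武汉大学的校长是谁"));   // -> 武汉大学的校长是谁
        System.out.println(strip("23\t武汉大学的校长是谁"));   // -> 武汉大学的校长是谁
    }
}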

+ 0
- 864
src/qa/extract/EntityRecognition.java

@@ -1,864 +0,0 @@
package qa.extract;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;

import fgmt.EntityFragment;
import nlp.ds.Word;
import qa.Globals;
import rdf.EntityMapping;
import rdf.NodeSelectedWithScore;
import rdf.TypeMapping;
import rdf.MergedWord;
import utils.FileUtil;
import addition.*;

/**
* Core class of Node Recognition
* @author husen
*/
public class EntityRecognition {
public String preLog = "";
public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt";
double EntAcceptedScore = 26;
double TypeAcceptedScore = 0.5;
double AcceptedDiffScore = 1;
public ArrayList<MergedWord> mWordList = null;
public ArrayList<String> stopEntList = null;
public ArrayList<String> badTagListForEntAndType = null;
ArrayList<ArrayList<Integer>> selectedList = null;
TypeRecognition tr = null;
AddtionalFix af = null;
public EntityRecognition()
{
// LOG
preLog = "";
loadStopEntityDict();
// Bad posTag for entity
badTagListForEntAndType = new ArrayList<String>();
badTagListForEntAndType.add("RBS");
badTagListForEntAndType.add("JJS");
badTagListForEntAndType.add("W");
badTagListForEntAndType.add(".");
badTagListForEntAndType.add("VBD");
badTagListForEntAndType.add("VBN");
badTagListForEntAndType.add("VBZ");
badTagListForEntAndType.add("VBP");
badTagListForEntAndType.add("POS");
// Additional fix for CATEGORY (in DBpedia)
af = new AddtionalFix();
tr = new TypeRecognition();
System.out.println("EntityRecognizer Initial : ok!");
}
public void loadStopEntityDict()
{
stopEntList = new ArrayList<String>();
try
{
List<String> inputs = FileUtil.readFile(stopEntFilePath);
for(String line: inputs)
{
if(line.startsWith("#"))
continue;
stopEntList.add(line);
}
}
catch (Exception e) {
e.printStackTrace();
}
}
public ArrayList<String> process(String question)
{
ArrayList<String> fixedQuestionList = new ArrayList<String>();
ArrayList<Integer> literalList = new ArrayList<Integer>();
HashMap<Integer, Double> entityScores = new HashMap<Integer, Double>();
HashMap<Integer, Integer> entityMappings = new HashMap<Integer, Integer>();
HashMap<Integer, Double> typeScores = new HashMap<Integer, Double>();
HashMap<Integer, String> typeMappings = new HashMap<Integer, String>();
HashMap<Integer, Double> mappingScores = new HashMap<Integer, Double>();
ArrayList<Integer> mustSelectedList = new ArrayList<Integer>();
System.out.println("--------- entity/type recognition start ---------");
Word[] words = Globals.coreNLP.getTaggedWords(question);
mWordList = new ArrayList<MergedWord>();
long t1 = System.currentTimeMillis();
int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0;
boolean needRemoveCommas = false;
// Check entity & type
// Notice, ascending order by length
StringBuilder tmpOW = new StringBuilder();
StringBuilder tmpBW = new StringBuilder();
for(int len=1; len<=words.length; len++)
{
for(int st=0,ed=st+len; ed<=words.length; st++,ed++)
{
String originalWord = "", baseWord = "", allUpperWord = "";
//String[] posTagArr = new String[len];
for(int j=st; j<ed; j++)
{
//posTagArr[j-st] = words[j].posTag;
//originalWord += words[j].originalForm;
//baseWord += words[j].baseForm;
tmpOW.append(words[j].originalForm);
tmpBW.append(words[j].baseForm);
String tmp = words[j].originalForm;
if(tmp.length()>0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z')
{
String pre = tmp.substring(0,1).toUpperCase();
tmp = pre + tmp.substring(1);
}
allUpperWord += tmp;
if(j < ed-1)
{
//originalWord += "_";
//baseWord += "_";
tmpOW.append("_");
tmpBW.append("_");
}
}
originalWord = tmpOW.toString();
baseWord=tmpBW.toString();
tmpOW.setLength(0);
tmpBW.setLength(0);
allCnt++;
/*
* Filters to speed up and drop some bad cases.
*/
boolean entOmit = false, typeOmit = false;
int prep_cnt=0;
// Upper words can pass filter. eg: "Melbourne , Florida"
int UpperWordCnt = 0;
for(int i=st;i<ed;i++)
if((words[i].originalForm.charAt(0)>='A' && words[i].originalForm.charAt(0)<='Z')
|| ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i<ed-1))
UpperWordCnt++;
// Filters
if(UpperWordCnt<len || st==0)
{
if(st==0)
{
if(!words[st].posTag.startsWith("DT") && !words[st].posTag.startsWith("N"))
{
entOmit = true;
typeOmit = true;
}
}
else if(st>0)
{
Word formerWord = words[st-1];
//as princess
if(formerWord.baseForm.equals("as"))
entOmit = true;
//how many dogs?
if(formerWord.baseForm.equals("many"))
entOmit = true;
//obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series
if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP")))
entOmit = true;
//the father of you
if(ed<words.length)
{
Word nextWord = words[ed];
if(formerWord.posTag.equals("DT") && nextWord.posTag.equals("IN"))
entOmit = true;
}
//the area code of ; the official language of
boolean flag1=false, flag2=false;
for(int i=0;i<=st;i++)
if(words[i].posTag.equals("DT"))
flag1 = true;
for(int i=ed-1;i<words.length;i++)
if(words[i].posTag.equals("IN"))
flag2 = true;
if(flag1 && flag2)
entOmit = true;
}
if(ed < words.length)
{
Word nextWord = words[ed];
// (lowerCase)+(UpperCase)
if(nextWord.originalForm.charAt(0)>='A' && nextWord.originalForm.charAt(0)<='Z')
entOmit = true;
}
for(int i=st;i<ed;i++)
{
if(words[i].posTag.startsWith("I"))
prep_cnt++;
for(String badTag: badTagListForEntAndType)
{
if(words[i].posTag.startsWith(badTag))
{
entOmit = true;
typeOmit = true;
break;
}
}
if(words[i].posTag.startsWith("P") && (i!=ed-1 || len==1)){
entOmit = true;
typeOmit = true;
}
// First word
if(i==st)
{
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("EX") || words[i].posTag.startsWith("TO"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].posTag.startsWith("D") && len==2){
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.startsWith("list") || words[i].baseForm.startsWith("many"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.equals("and"))
{
entOmit = true;
typeOmit = true;
}
}
// Last word.
if(i==ed-1)
{
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("D") || words[i].posTag.startsWith("TO"))
{
entOmit = true;
typeOmit = true;
}
if(words[i].baseForm.equals("and"))
{
entOmit = true;
typeOmit = true;
}
}
// Single word.
if(len==1)
{
//TODO: Omit general noun. eg: father, book ...
if(!words[i].posTag.startsWith("N"))
{
entOmit = true;
typeOmit = true;
}
}
}
// Too many preposition.
if(prep_cnt >= 3)
{
entOmit = true;
typeOmit = true;
}
}
/*
* Filter done.
*/
// Search category | highest priority
String category = null;
if(af.pattern2category.containsKey(baseWord))
{
typeOmit = true;
entOmit = true;
category = af.pattern2category.get(baseWord);
}
// Search type
int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend()
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>();
if(!typeOmit)
{
System.out.println("Type Check: "+originalWord);
//checkTypeCnt++;
//search standard type
tmList = tr.getTypeIDsAndNamesByStr(baseWord);
if(tmList == null || tmList.size() == 0)
{
tmList = tr.getTypeIDsAndNamesByStr(originalWord);
if(tmList != null && tmList.size()>0)
hitMethod = 2;
}
else
hitMethod = 1;
//Search extend type (YAGO type)
if(tmList == null || tmList.size() == 0)
{
tmList = tr.getExtendTypeByStr(allUpperWord);
if(tmList != null && tmList.size() > 0)
{
preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n";
hitMethod = 3;
}
}
}
// Search entity
ArrayList<EntityMapping> emList = new ArrayList<EntityMapping>();
if(!entOmit && !stopEntList.contains(baseWord))
{
System.out.println("Ent Check: "+originalWord);
checkEntCnt++;
// Notice, the second parameter is whether use DBpedia Lookup.
emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len);
if(emList == null || emList.size() == 0)
{
emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len);
}
if(emList!=null && emList.size()>10)
{
ArrayList<EntityMapping> tmpList = new ArrayList<EntityMapping>();
for(int i=0;i<10;i++)
{
tmpList.add(emList.get(i));
}
emList = tmpList;
}
}
MergedWord mWord = new MergedWord(st,ed,originalWord);
// Add category
if(category != null)
{
mWord.mayCategory = true;
mWord.category = category;
int key = st*(words.length+1) + ed;
mustSelectedList.add(key);
}
// Add literal
if(len==1 && checkLiteralWord(words[st]))
{
mWord.mayLiteral = true;
int key = st*(words.length+1) + ed;
literalList.add(key);
}
// Add type mappings
if(tmList!=null && tmList.size()>0)
{
// Drop by score threshold
if(tmList.get(0).score < TypeAcceptedScore)
typeOmit = true;

// Only allow EXACT MATCH when method=1|2
// TODO: consider approximate match and taxonomy. eg, actor->person
String likelyType = tmList.get(0).typeName.toLowerCase();
String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase();
if(!candidateBase.equals(likelyType) && hitMethod == 1)
typeOmit = true;
if(!candidateOriginal.equals(likelyType) && hitMethod == 2)
typeOmit = true;
if(!typeOmit)
{
mWord.mayType = true;
mWord.tmList = tmList;
int key = st*(words.length+1) + ed;
typeMappings.put(key, tmList.get(0).typeName);
typeScores.put(key, tmList.get(0).score);
}
}
// Add entity mappings
if(emList!=null && emList.size()>0)
{
// Drop by score threshold
if(emList.get(0).score < EntAcceptedScore)
entOmit = true;
// Drop: the [German Shepherd] dog
else if(len > 2)
{
for(int key: entityMappings.keySet())
{
//int te=key%(words.length+1);
int ts=key/(words.length+1);
if(ts == st+1 && ts <= ed)
{
//DT in lowercase (allow uppercase, such as: [The Pillars of the Earth])
if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z'))
{
entOmit = true;
}
}
}
}
// Record info in merged word
if(!entOmit)
{
mWord.mayEnt = true;
mWord.emList = emList;
// use to remove duplicate and select
int key = st*(words.length+1) + ed;
entityMappings.put(key, emList.get(0).entityID);
// fix entity score | conflict resolution
double score = emList.get(0).score;
String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_");
String lowerOriginalWord = originalWord.toLowerCase();
// !Award: whole match
if(likelyEnt.equals(lowerOriginalWord))
score *= len;
// !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy])
//e.g, Social_Democratic_Party -> all ents -> drop the overlapped smaller ones
//e.g, Abraham_Lincoln -> select the whole word
if(len>1)
{
boolean[] flag = new boolean[words.length+1];
ArrayList<Integer> needlessEntList = new ArrayList<Integer>();
double tmpScore=0;
for(int preKey: entityMappings.keySet())
{
if(preKey == key)
continue;
int te=preKey%(words.length+1),ts=preKey/(words.length+1);
for(int i=ts;i<te;i++)
flag[i] = true;
if(st<=ts && ed>= te)
{
needlessEntList.add(preKey);
tmpScore += entityScores.get(preKey);
}
}
int hitCnt = 0;
for(int i=st;i<ed;i++)
if(flag[i])
hitCnt++;
// WHOLE match || HIGH match & HIGH upper || WHOLE upper
if(hitCnt == len || ((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4)
{
boolean commaTotalRight = true;
if(originalWord.contains(","))
{
String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase();
String likelyCompactEnt = likelyEnt.replace(",","").replace("_", "");
if(!candidateCompactString.equals(likelyCompactEnt))
commaTotalRight = false;
else
{
mWord.name = mWord.name.replace("_,_","_");
needRemoveCommas = true;
}
}
if(commaTotalRight)
{
mustSelectedList.add(key);
if(tmpScore>score)
score = tmpScore+1;
for(int preKey: needlessEntList)
{
entityMappings.remove(preKey);
mustSelectedList.remove(Integer.valueOf(preKey));
}
}
}
}
//NOTICE: score in mWord have no changes. we only change the score in entityScores.
entityScores.put(key,score);
}
}
if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral)
mWordList.add(mWord);
}
}
/* Print all candidates (use fixed score).*/
System.out.println("------- Result ------");
for(MergedWord mWord: mWordList)
{
int key = mWord.st * (words.length+1) + mWord.ed;
if(mWord.mayCategory)
{
System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0");
preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n";
}
if(mWord.mayEnt)
{
System.out.println("Detect entity mapping: "+mWord.name+": [");
for(EntityMapping em: mWord.emList)
System.out.print(em.entityName + ", ");
System.out.println("]");
preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n";
hitEntCnt++;
}
if(mWord.mayType)
{
System.out.println("Detect type mapping: "+mWord.name+": [");
for(TypeMapping tm: mWord.tmList)
System.out.print(tm.typeName + ", ");
System.out.println("]");
preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n";
hitTypeCnt++;
}
if(mWord.mayLiteral)
{
System.out.println("Detect literal: "+mWord.name);
preLog += "++++ Literal detect: "+mWord.name+"\n";
}
}
/*
* Sort by score and remove duplicate.
* eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">.
* Notice, reserve all information in mWordList.
*/
// one ENT maps different mergedWord in query, reserve the higher score.
ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1);
List<Integer> keys = new ArrayList<Integer>(entityMappings.keySet());
Collections.sort(keys, bvc);
for(Integer key : keys)
{
if(!mappingScores.containsKey(entityMappings.get(key)))
mappingScores.put(entityMappings.get(key), entityScores.get(key));
else
entityMappings.remove(key);
}
selectedList = new ArrayList<ArrayList<Integer>>();
ArrayList<Integer> selected = new ArrayList<Integer>();
// Some phrases must be selected.
selected.addAll(mustSelectedList);
for(Integer key: typeMappings.keySet())
{
// !type(len>1) (Omit len=1 because: [Brooklyn Bridge] is a entity.
int ed = key%(words.length+1), st = key/(words.length+1);
if(st+1 < ed)
{
boolean beCovered = false;
//Entity cover type, eg:[prime_minister of Spain]
for(int preKey: entityMappings.keySet())
{
int te=preKey%(words.length+1),ts=preKey/(words.length+1);
//Entiy should longer than type
if(ts <= st && te >= ed && ed-st < te-ts)
{
beCovered = true;
}
}
if(!beCovered)
selected.add(key);
}
}
// Conflict resolution
ArrayList<Integer> noConflictSelected = new ArrayList<Integer>();
//select longer one when conflict
boolean[] flag = new boolean[words.length];
ByLenComparator blc = new ByLenComparator(words.length+1);
Collections.sort(selected,blc);
for(Integer key : selected)
{
int ed = key%(words.length+1), st = (key-ed)/(words.length+1);
boolean omit = false;
for(int i=st;i<ed;i++)
{
if(flag[i])
{
omit = true;
break;
}
}
if(omit)
continue;
for(int i=st;i<ed;i++)
flag[i]=true;
noConflictSelected.add(key);
}
// Scoring and ranking --> top-k decision
dfs(keys,0,noConflictSelected,words.length+1);
ArrayList<NodeSelectedWithScore> nodeSelectedWithScoreList = new ArrayList<NodeSelectedWithScore>();
for(ArrayList<Integer> select: selectedList)
{
double score = 0;
for(Integer key: select)
{
if(entityScores.containsKey(key))
score += entityScores.get(key);
if(typeScores.containsKey(key))
score += typeScores.get(key);
}
NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score);
nodeSelectedWithScoreList.add(tmp);
}
Collections.sort(nodeSelectedWithScoreList);
// Replace
int cnt = 0;
for(int k=0; k<nodeSelectedWithScoreList.size(); k++)
{
if(k >= nodeSelectedWithScoreList.size())
break;
selected = nodeSelectedWithScoreList.get(k).selected;
Collections.sort(selected);
int j = 0;
String res = question;
if(selected.size()>0)
{
res = words[0].originalForm;
int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1);
for(int i=1;i<words.length;i++)
{
if(i>st && i<ed)
{
res = res+"_"+words[i].originalForm;
}
else
{
res = res+" "+words[i].originalForm;
}
if(i >= ed && j<selected.size())
{
tmp = selected.get(j++);
st = tmp/(words.length+1);
ed = tmp%(words.length+1);
}
}
}
else
{
res = words[0].originalForm;
for(int i=1;i<words.length;i++)
{
res = res+" "+words[i].originalForm;
}
}
boolean ok = true;
for(String str: fixedQuestionList)
if(str.equals(res))
ok = false;
if(!ok)
continue;
if(needRemoveCommas)
res = res.replace("_,_","_");
System.out.println("Merged: "+res);
preLog += "plan "+cnt+": "+res+"\n";
fixedQuestionList.add(res);
cnt++;
if(cnt >= 3) // top-3
break;
}
long t2 = System.currentTimeMillis();
// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n";
// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n";
preLog += "Node Recognition time: "+ (t2-t1) + "ms\n";
System.out.println("Total check time: "+ (t2-t1) + "ms");
System.out.println("--------- pre entity/type recognition end ---------");
return fixedQuestionList;
}
public void dfs(List<Integer> keys,int dep,ArrayList<Integer> selected,int size)
{
if(dep == keys.size())
{
ArrayList<Integer> tmpList = (ArrayList<Integer>) selected.clone();
selectedList.add(tmpList);
}
else
{
//off: dep-th mWord
dfs(keys,dep+1,selected,size);
//on: no conflict
boolean conflict = false;
for(int preKey: selected)
{
int curKey = keys.get(dep);
int preEd = preKey%size, preSt = (preKey-preEd)/size;
int curEd = curKey%size, curSt = (curKey-curEd)/size;
if(!(preSt<preEd && preEd<=curSt && curSt<curEd) && !(curSt<curEd && curEd<=preSt && preSt<preEd))
conflict = true;
}
if(!conflict)
{
selected.add(keys.get(dep));
dfs(keys,dep+1,selected,size);
selected.remove(keys.get(dep));
}
}
}
public ArrayList<EntityMapping> getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len)
{
String n = entity;
ArrayList<EntityMapping> ret= new ArrayList<EntityMapping>();
//1. Lucene index
ret.addAll(EntityFragment.getEntityMappingList(n));
//2. DBpedia Lookup (some cases)
if (useDblk)
{
ret.addAll(Globals.dblk.getEntityMappings(n, null));
}
Collections.sort(ret);
if (ret.size() > 0) return ret;
else return null;
}
public int preferDBpediaLookupOrLucene(String entityName)
{
int cntUpperCase = 0;
int cntSpace = 0;
int cntPoint = 0;
int length = entityName.length();
for (int i=0; i<length; i++)
{
char c = entityName.charAt(i);
if (c==' ')
cntSpace++;
else if (c=='.')
cntPoint++;
else if (c>='A' && c<='Z')
cntUpperCase++;
}
if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3)
return 1;
if (cntUpperCase == length)
return 1;
return 0;
}
static class ByValueComparator implements Comparator<Integer> {
HashMap<Integer, Double> base_map;
int base_size;
double eps = 1e-8;
int dblcmp(double a,double b)
{
if(a+eps < b)
return -1;
return b+eps<a ? 1:0;
}
public ByValueComparator(HashMap<Integer, Double> base_map, Integer size) {
this.base_map = base_map;
this.base_size = size;
}
public int compare(Integer arg0, Integer arg1) {
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) {
return 0;
}
if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) {
return 1;
}
else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0)
{
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size;
if (len0 < len1) {
return 1;
} else if (len0 == len1) {
return 0;
} else {
return -1;
}
}
else {
return -1;
}
}
}
static class ByLenComparator implements Comparator<Integer> {
int base_size;
public ByLenComparator(int size) {
this.base_size = size;
}
public int compare(Integer arg0, Integer arg1) {
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size;
if (len0 < len1) {
return 1;
} else if (len0 == len1) {
return 0;
} else {
return -1;
}
}
}
public boolean isDigit(char ch)
{
if(ch>='0' && ch<='9')
return true;
return false;
}
//TODO: other literal words.
public boolean checkLiteralWord(Word word)
{
boolean ok = false;
if(word.posTag.equals("CD"))
ok = true;
return ok;
}
public static void main (String[] args)
{
Globals.init();
EntityRecognition er = new EntityRecognition();
try
{
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
while (true)
{
System.out.println("Please input the question: ");
String question = br.readLine();
er.process(question);
}
} catch (IOException e) {
e.printStackTrace();
}
}

}

+ 566
- 0
src/qa/extract/EntityRecognitionCh.java

@@ -0,0 +1,566 @@
package qa.extract;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.InputStreamReader;

import lcn.EntityFragmentFields;

import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

import edu.stanford.nlp.util.Pair;
import fgmt.TypeFragment;
import qa.Query;
import rdf.EntityMapping;
import rdf.TypeMapping;
import nlp.ds.*;
import utils.FileUtil;

final class MODNUM
{
public static int prime=9999991;
}
//TODO: replace with nlp.ds.Word
class Word
{
//type:0=normal word 1=entity 2=literal(string)
String word;
int type;
int pos=0;
List<String> entList=null;
Word(String w)
{
word=w;
type=0;
}
Word(String w,int i)
{
word=w;
type=i;
}
Word(String w,int i, int j)
{
word=w;
type=i;
pos=j;
}
Word(String w,int i, int j,List<String> l)
{
word=w;
type=i;
pos=j;
entList=l;
}
}

class Ent
{
public final int mod=MODNUM.prime;
public String entity_name,mention;
public int no;
public long hashe,hashm;
public Ent(String load)
{
int indexOf9=load.indexOf(9);
if (indexOf9>=0)
{
mention=load.substring(0, indexOf9);
String tmp=load.substring(indexOf9+1);
int t9=tmp.indexOf(9);
if (t9>=0)
{
entity_name=tmp.substring(0, t9);
String numberStr=tmp.substring(t9+1);
try
{
no=Integer.valueOf(numberStr);
}catch(Exception e){no=-1;};
}
else entity_name=tmp;
hashe=calHash(entity_name);
}
else
{
mention=load;
hashe=-1;
}
hashm=calHash(mention);
}
public long calHash(String p)
{
long x=0;
if (p==null || p.length()==0) return 0;
for (int i=0;i<p.length();i++)
{
x=x*65536+(long)(int)p.charAt(i);
x=x%mod;
}
return x;
}
@Override
public int hashCode()
{
return (int)hashm;
}
public Ent(){};
}

public class EntityRecognitionCh {
public static HashMap<String, List<String>> entMap,nentMap;
public static JiebaSegmenter segmenter = new JiebaSegmenter();
public final static int MaxEnt=20;
static
{
long t0 = System.currentTimeMillis();
List<String> nent = FileUtil.readFile("data/pkubase/paraphrase/ccksminutf.txt");
List<String> mention2ent = FileUtil.readFile("data/pkubase/paraphrase/mini-mention2ent.txt");

entMap=new HashMap<>();
nentMap=new HashMap<>();

System.out.println("Mention2Ent size: " + mention2ent.size());
for (String input:mention2ent)
{
Ent q=new Ent(input);
if (entMap.containsKey(q.mention))
entMap.get(q.mention).add(q.entity_name);
else
{
List<String> l=new ArrayList<>();
l.add(q.entity_name);
entMap.put(q.mention, l);
}
}
// In the non-entity list, 'mention' is a word that is NOT an entity; 'entity_name' stores its frequency
for (String input:nent)
{
Ent q=new Ent(input);
if (nentMap.containsKey(q.mention))
nentMap.get(q.mention).add(q.entity_name);
else
{
List<String> l=new ArrayList<>();
l.add(q.entity_name);
nentMap.put(q.mention, l);
}
}
long t1 = System.currentTimeMillis();
System.out.println("Read Mention2Ent used "+(t1-t0)+"ms");
}
public static boolean isAllNumber(String q)
{
boolean ret=true;
for (int i=0;i<q.length();i++)
{
if (q.charAt(i)<48 || q.charAt(i)>57) return false;
}
return ret;
}
public static String longestFirst2(String Question)
{
String ret="";
String input=Question.replace('{',' ').replace('}',' ');
int len=input.length();
int[][] ex=new int[len+3][];
Ent[][] entx=new Ent[len+3][];
for (int i=0;i<len+2;i++) ex[i]=new int[len+3];
for (int i=0;i<len+2;i++) entx[i]=new Ent[len+3];
for (int l=1;l<=len;l++)
{
int pos=0;
for (int j=l-1;j<len;j++)
{
String searchstr=input.substring(j-l+1,j+1);
List<String> rstlist=entMap.get(searchstr);

if (rstlist!=null && rstlist.size()>0)
{
++pos;
ex[l][pos]=j;
entx[l][pos]=new Ent(searchstr);
}
}
ex[l][0]=pos;
}
int covered[]=new int[len+3];
for (int l=len;l>=1;l--)
{
for (int p=1;p<=ex[l][0];p++)
{
int flag=1;
for (int k=ex[l][p];k>=ex[l][p]-l+1;k--) if (covered[k]>0) flag=0;
if (flag==1)
{
//flag bits: 1 = occupied, 2 = phrase start, 4 = phrase end, 8 = other (do not bracket)
int FLAG=0;
List<String> nlist=nentMap.get(entx[l][p].mention);
if (nlist!=null && nlist.size()>0) FLAG=8;
if (isAllNumber(entx[l][p].mention)) FLAG=8;
covered[ex[l][p]]|=4;
covered[ex[l][p]-l+1]|=2;
for (int k=ex[l][p];k>=ex[l][p]-l+1;k--)
{
covered[k]|=1|FLAG;
}
}
}
}
for (int i=0;i<len;i++)
{
if ((covered[i]&2)!=0 && (covered[i]&8)==0) ret=ret+"{";
ret=ret+Question.charAt(i);
if ((covered[i]&4)!=0 && (covered[i]&8)==0) ret=ret+"}";
}
//System.out.println("Longest First: "+ret);
//System.out.println("Time: "+(t1-t0)+"ms");
return ret;
}
//1->①
public static String intToCircle(int i)
{
if (0>i || i>20) return null;
String ret="";
ret=ret+(char)(9311+i);
return ret;
}
//①->1
public static int circleToInt(String i)
{
int ret=i.charAt(0)-9311;
if (0<ret&& ret<20) return ret;
else return -1;
}
public static Pair<String,List<Word>> processedString(String s)
{
List<Word> ret=new ArrayList<>();
String sentence = "";
int flag=0;
String word="";
for (int i=0;i<s.length();i++)
{
if (s.charAt(i)=='{')
{
flag=1;
continue;
}
if (s.charAt(i)=='}')
{
if (word.length()<=2)
{
sentence+=word;
word="";
flag=0;
continue;
}
int FLAG=-1;
for (Word j:ret)
if (word.equals(j.word))
FLAG=j.pos;
if (FLAG==-1)
{
flag=0;
ret.add(new Word(word,1,ret.size()+1));
word="";
sentence+=intToCircle(ret.size());
continue;
}
else
{
flag=0;
word="";
sentence+=intToCircle(FLAG);
continue;
}
}
if (flag==0) sentence+=s.charAt(i);
if (flag==1) word=word+s.charAt(i);
}
return new Pair<String,List<Word>>(sentence,ret);
}
public static String reprocess(List<Word> d, List<SegToken> list)
{
String ret="";
int used[]=new int[list.size()+1];
int isValid[]=new int[list.size()+1];
for (int i=0;i<list.size();i++) isValid[i]=0;
for(int len=4;len>=1;len--)
{
for (int i=0;i<list.size()-len+1;i++)
{
String tmp="";
int flag=1;
for (int j=i;j<i+len;j++)
{
tmp=tmp+list.get(j).word;
if (tmp.length()>4) flag=0;
if (circleToInt(list.get(j).word)>=0) flag=0;
if (used[j]==1) flag=0;
}
if (flag==0) continue;
List<String> rstlist=entMap.get(tmp);
List<String> nlist=nentMap.get(tmp);
if (nlist!=null && nlist.size()>0)
{
for (int j=i;j<i+len;j++)
{
used[j]=1;
}
}
if (rstlist!=null && rstlist.size()>0 && (nlist==null||nlist.size()==0))
{
for (int j=i;j<i+len;j++) used[j]=1;
int pos=-1;
for (Word k:d) if (tmp.equals(k.word))
{
pos=k.pos;break;
}
if (pos>0)
{
isValid[i]=pos;
for (int j=i+1;j<i+len;j++)isValid[j]=-1;
}
else
{
d.add(new Word(tmp,1,d.size()+1));
isValid[i]=d.size();
for (int j=i+1;j<i+len;j++)isValid[j]=-1;
}
}

}
}
for (int i=0;i<list.size();i++)
{
if (isValid[i]==0)
{
ret=ret+list.get(i).word;
}
if (isValid[i]>0)
{
ret=ret+intToCircle(isValid[i]);
}
}
return ret;
}
public static String removeQueryId2(String question)
{
String ret = question;
int st = question.indexOf(":");
if(st!=-1 && st<6 && question.length()>4 && ((question.charAt(0)>='0' && question.charAt(0)<='9') ||question.charAt(0)=='q'))
{
ret = question.substring(st+1);
}
return ret;
}
public static String thirdprocess(String sentence,List<Word> d)
{
String temp="",rets2="";
int insyh=0;
int count=0;
List<Integer> lst=new ArrayList<>();
String syh="";
for (int i=0;i<sentence.length();i++)
{
if (circleToInt(""+sentence.charAt(i))!=-1)
{
count++;
}
else
{
if (count>=3)
{
String newent="";
for (int j=i-count;j<i;j++)
{
newent+=d.get(circleToInt(""+sentence.charAt(j))-1).word;
}
temp+=intToCircle(d.size());
d.add(new Word(newent,2,d.size()+1));
}
else
for (int j=i-count;j<i;j++)
{
temp+=sentence.charAt(j);
}
temp+=sentence.charAt(i);
count=0;
}
}
for (int i=0;i<temp.length();i++)
{
if (temp.charAt(i)=='"'&&insyh==0 || temp.charAt(i)=='“')
{
insyh=1;
syh="";
rets2+=temp.charAt(i);
}
else if (temp.charAt(i)=='"'&&insyh==1 || temp.charAt(i)=='”')
{
insyh=0;
if (lst.size()>=1)
{
String rp="";
for (int j=0;j<syh.length();j++)
{
int q=circleToInt(""+syh.charAt(j));
if (q==-1)
rp+=syh.charAt(j);
else
{
rp+=d.get(q-1).word;
//ret[q]="";
}
}
d.add(new Word(rp,2,d.size()+1));
rets2+=intToCircle(d.size())+temp.charAt(i);
}
else
{
rets2+=syh+temp.charAt(i);
}
}
else if (insyh==1)
{
if (circleToInt(""+temp.charAt(i))!=-1)
lst.add(circleToInt(""+temp.charAt(i)));
syh+=temp.charAt(i);
}
else
rets2+=temp.charAt(i);
}
return rets2;
}
public static Pair<String,List<Word>> parse(String input, JiebaSegmenter segmenter)
{
// input=removeQueryId2(input); // The query id has already been removed by the caller.
String newinput=longestFirst2 (input);

Pair<String,List<Word>> d=null,r=new Pair<String,List<Word>>();
r.second=new ArrayList<>();
try {
d=processedString(newinput);
} catch (Exception e) {
System.out.println(e);
}
if (d!=null)
{
//System.out.println(d.first);
List<SegToken> q=segmenter.process(d.first, SegMode.SEARCH);
String secondstr="";
for (SegToken t:q)
{
secondstr=secondstr+t.word+",";
}
//System.out.println("First process: "+secondstr);

String finalstring="";
String stickstr=reprocess(d.second,q);
String thirdstr=thirdprocess(stickstr,d.second);
List<SegToken> q2=segmenter.process(thirdstr, SegMode.SEARCH);
for (SegToken t:q2)
{
finalstring=finalstring+t.word+",";
int p=circleToInt(""+t.word.charAt(0));
if (p!=-1)
{
Word ds=d.second.get(p-1);
r.second.add(new Word(ds.word,ds.type,ds.pos,entMap.get(ds.word)));
}
else
{
r.second.add(new Word(t.word,0,-1));
}
}
System.out.println("Result: "+finalstring);
r.first=thirdstr;
return r;
}
else return null;
}
public static List<nlp.ds.Word> parseSentAndRecogEnt(String sent)
{
Pair<String, List<Word>> result = parse(sent, segmenter);
if(result == null)
return null;
List<nlp.ds.Word> words = new ArrayList<nlp.ds.Word>();
int position = 1;
for(Word ow: result.second)
{
// Note: jieba's POS tagging is deprecated, so we use the Stanford parser to obtain POS tags later.
nlp.ds.Word word = new nlp.ds.Word(ow.word, ow.word, null, position++);
words.add(word);
if(ow.type == 1 && ow.entList != null)
{
// For now, just handle TYPE here in a simple way.
if(TypeFragment.typeShortName2IdList.containsKey(ow.word))
{
word.mayType = true;
word.tmList.add(new TypeMapping(TypeFragment.typeShortName2IdList.get(ow.word).get(0), ow.word, 100.0));
}
word.mayEnt = true;
word.emList = new ArrayList<EntityMapping>();
double score = 100;
for(String ent: ow.entList)
{
if(EntityFragmentFields.entityName2Id.containsKey(ent))
{
//TODO: consider more suitable entity score
int eid = EntityFragmentFields.entityName2Id.get(ent);
// String fstr = EntityFragmentFields.entityFragmentString.get(eid);
// System.out.println(eid+"\t"+fstr);
word.emList.add(new EntityMapping(eid, ent, score));
score -= 10;
}
}
}
else if(ow.type == 2)
word.mayLiteral = true;
// TODO: consider TYPE
}
return words;
}
public static void main(String[] args) throws IOException {
EntityFragmentFields.load();
List<String> inputList = FileUtil.readFile("data/test/mini-ccks.txt");
for(String input: inputList)
{
if (input.length()<2 || input.charAt(0)!='q') continue;
System.out.println("----------------------------------------");
System.out.println(input);
EntityRecognitionCh.parseSentAndRecogEnt(input);
}

}

}
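
The core trick in EntityRecognitionCh above: every mention found by longestFirst2 is bracketed, then replaced by a single circled-number character (① = U+2460) so that jieba treats the whole mention as one indivisible token, and the placeholder is mapped back to its mention afterwards. A minimal round-trip sketch of that encoding follows; the example question and mention are illustrative assumptions, not pkubase data.

import java.util.ArrayList;
import java.util.List;

public class CircledPlaceholderSketch {
    // Mirrors intToCircle/circleToInt above: 1 -> '①' (9311 + 1 == U+2460).
    static String intToCircle(int i) { return String.valueOf((char) (9311 + i)); }
    static int circleToInt(char c)   { int r = c - 9311; return (r > 0 && r < 20) ? r : -1; }

    public static void main(String[] args) {
        String question = "{大兴安岭}的终点是哪里";            // braces mark a mention, as produced by longestFirst2
        List<String> mentions = new ArrayList<>();
        StringBuilder masked = new StringBuilder(), mention = new StringBuilder();
        boolean inBrace = false;
        for (char c : question.toCharArray()) {
            if (c == '{') { inBrace = true; continue; }
            if (c == '}') {                                   // close a mention: emit one placeholder character
                mentions.add(mention.toString());
                masked.append(intToCircle(mentions.size()));
                mention.setLength(0);
                inBrace = false;
                continue;
            }
            if (inBrace) mention.append(c); else masked.append(c);
        }
        System.out.println(masked);                           // ①的终点是哪里 -- safe to segment with jieba
        for (char c : masked.toString().toCharArray()) {      // map placeholders back to mentions
            int p = circleToInt(c);
            if (p != -1) System.out.println(c + " -> " + mentions.get(p - 1));
        }
    }
}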


+ 1
- 2
src/qa/extract/ExtractImplicitRelation.java

@@ -19,7 +19,6 @@ import log.QueryLogger;
import fgmt.EntityFragment;
import fgmt.TypeFragment;
import nlp.ds.Word;
import nlp.tool.CoreNLP;

public class ExtractImplicitRelation {
@@ -374,7 +373,7 @@ public class ExtractImplicitRelation {
public static void main(String[] args) throws Exception {
Globals.coreNLP = new CoreNLP();
// Globals.coreNLP = new CoreNLP();
Globals.pd = new ParaphraseDictionary();
try
{


+ 0
- 2
src/qa/extract/ExtractRelation.java

@@ -28,8 +28,6 @@ public class ExtractRelation {
public ArrayList<SimpleRelation> findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog)
{
DependencyTree T = qlog.s.dependencyTreeStanford;
if(qlog.isMaltParserUsed)
T = qlog.s.dependencyTreeMalt;
DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position);
ArrayList<DependencyTreeNode> shortestPath = T.getShortestNodePathBetween(n1,n2);


+ 39
- 47
src/qa/extract/TypeRecognition.java

@@ -90,15 +90,7 @@ public class TypeRecognition {
if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase()))
return null;
//search in YAGO type
if(TypeFragment.yagoTypeList.contains(allUpperFormWord))
{
//YAGO prefix
String typeName = "yago:"+allUpperFormWord;
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1);
tmList.add(tm);
}
else if(extendTypeMap.containsKey(allUpperFormWord))
if(extendTypeMap.containsKey(allUpperFormWord))
{
String typeName = extendTypeMap.get(allUpperFormWord);
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1);
@@ -251,22 +243,22 @@ public class TypeRecognition {
}
}
// type
else if(sr.arg1Word.mayType)
else if(sr.arg1Word.mayType) //TODO: type
{
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries
if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of"))
&& !words[arg1WordPos-2].posTag.startsWith("V"))
{
sr.isArg1Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
//constant type should be object
sr.preferredSubj = sr.arg2Word;
}
// if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of"))
// && !words[arg1WordPos-2].posTag.startsWith("V"))
// {
// sr.isArg1Constant = true;
// double largerScore = 1000;
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
// largerScore = sr.predicateMappings.get(0).score * 2;
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
// sr.predicateMappings.add(0,nPredicate);
//
// //constant type should be object
// sr.preferredSubj = sr.arg2Word;
// }
}
//ent: constant
else if(sr.arg1Word.mayEnt)
@@ -297,37 +289,37 @@ public class TypeRecognition {
else if(sr.arg2Word.mayType)
{
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries
if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of"))
&& !words[arg2WordPos-2].posTag.startsWith("V") )
{
sr.isArg2Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
sr.preferredSubj = sr.arg1Word;
}
// if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of"))
// && !words[arg2WordPos-2].posTag.startsWith("V") )
// {
// sr.isArg2Constant = true;
// double largerScore = 1000;
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
// largerScore = sr.predicateMappings.get(0).score * 2;
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
// sr.predicateMappings.add(0,nPredicate);
//
// sr.preferredSubj = sr.arg1Word;
// }
//rule: Be ... a type?
if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a"))
{
sr.isArg2Constant = true;
double largerScore = 1000;
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
largerScore = sr.predicateMappings.get(0).score * 2;
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
sr.predicateMappings.add(0,nPredicate);
sr.preferredSubj = sr.arg1Word;
}
// if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a"))
// {
// sr.isArg2Constant = true;
// double largerScore = 1000;
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0)
// largerScore = sr.predicateMappings.get(0).score * 2;
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]");
// sr.predicateMappings.add(0,nPredicate);
//
// sr.preferredSubj = sr.arg1Word;
// }
}
else if(sr.arg2Word.mayEnt)
{
sr.isArg2Constant = true;
}
if(sr.arg1Word != sr.preferredSubj)
if(sr.arg2Word == sr.preferredSubj)
sr.swapArg1Arg2();
}
}


+ 0
- 163
src/qa/mapping/DBpediaLookup.java

@@ -1,163 +0,0 @@
package qa.mapping;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.HashMap;

import lcn.EntityFragmentFields;
import log.QueryLogger;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.GetMethod;

import fgmt.EntityFragment;
import rdf.EntityMapping;

public class DBpediaLookup {
//There are two websites of the DBpediaLookup online service.
//public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search=";
public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString=";
public HttpClient ctripHttpClient = null;
//public static final String begin = "<Text xml:space=\"preserve\">";
//public static final String begin = "<Result>\n <Label>";
public static final String begin = "<Result>\n <Label>";
public static final int begin_length = begin.length();
//public static final String end = "</Text>";
public static final String end = "</Label>";
public static final int end_length = end.length();
public static HashMap<String, String>entMentionDict = null; // TODO: base on redirect data & wikipedia click data to build mention2ent's dictionary, now just manually
public DBpediaLookup()
{
ctripHttpClient = new HttpClient();
ctripHttpClient.setTimeout(3000);
entMentionDict = new HashMap<String, String>();
entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales");
}
public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog)
{
ArrayList<String> slist = new ArrayList<String>();
if(entMentionDict.containsKey(searchString))
slist.add(entMentionDict.get(searchString));
else
slist = lookForEntityNames(searchString, qlog);
if (slist.size() == 0 && searchString.contains(". "))
slist.addAll(lookForEntityNames(searchString.replaceAll(". ", "."), qlog));
ArrayList<EntityMapping> emlist = new ArrayList<EntityMapping>();
// Now string use "_" as delimiter (original)
String[] sa = searchString.split("_");
int UpperCnt = 0;
for(String str: sa)
{
if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') )
UpperCnt ++;
}
System.out.print("DBpediaLookup find: " + slist + ", ");
int count = 40;
for (String s : slist)
{
//consider ABBR only when all UPPER; drop when too long edit distance
if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2)
continue;
int eid = -1;
s = s.replace(" ", "_");
if(EntityFragmentFields.entityName2Id.containsKey(s))
{
eid = EntityFragmentFields.entityName2Id.get(s);
emlist.add(new EntityMapping(eid, s, count));
count -=2 ;
}
else
{
System.out.print("Drop "+s+" because it not in Entity Dictionary. ");
}
}
System.out.println("DBpediaLookup select: " + emlist);
return emlist;
}
public ArrayList<String> lookForEntityNames (String searchString, QueryLogger qlog) {
// URL transition: " " -> %20
GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20"));
ArrayList<String> ret = new ArrayList<String>();
int statusCode;
try {
statusCode = ctripHttpClient.executeMethod(getMethod);
} catch (HttpException e) {
e.printStackTrace();
return ret;
} catch (IOException e) {
e.printStackTrace();
return ret;
}
if (statusCode!=200) return null;
String response = getMethod.getResponseBodyAsString();
if (qlog != null && qlog.MODE_debug) {
System.out.println("searchString=" + searchString);
System.out.println("statusCode=" + statusCode);
System.out.println("response=" + getMethod.getResponseBodyAsString());
}
getMethod.releaseConnection();
//System.out.println(response);
if (response == null || response.isEmpty())
return ret;
int idx1 = response.indexOf(begin);
while (idx1 != -1) {
int idx2 = response.indexOf(end, idx1+begin_length);
String ss = response.substring(idx1+begin_length, idx2);
ret.add(ss);
//System.out.println(ss);
idx1 = response.indexOf(begin, idx2 + end_length);
}

return ret;
}
public static void main(String argv[]){
DBpediaLookup dbplook = new DBpediaLookup();
BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
try {
while (true) {
System.out.println("Test DBpediaLookup.");
System.out.print("Please input the search string: ");
String searchString = br.readLine();
try {
long t1 = System.currentTimeMillis();
ArrayList<String> res = dbplook.lookForEntityNames(searchString, null);
long t2 = System.currentTimeMillis();
System.out.println(res);
System.out.println("time=" + (t2-t1) + "ms");
} catch (Exception e) {
e.printStackTrace();
}
}
} catch (IOException e) {
e.printStackTrace();
}

return;
}
}

+ 112
- 197
src/qa/parsing/BuildQueryGraph.java

@@ -37,84 +37,19 @@ public class BuildQueryGraph
public BuildQueryGraph()
{
whList.add("what");
whList.add("which");
whList.add("who");
whList.add("whom");
whList.add("when");
whList.add("how");
whList.add("where");
whList.add("什么");
whList.add("什么时候");
whList.add("哪些");
whList.add("哪里");
whList.add("谁");
// Bad words for NODE. (base form)
// We will train a node recognition model to replace such heuristic rules further.
stopNodeList.add("list");
stopNodeList.add("give");
stopNodeList.add("show");
stopNodeList.add("star");
stopNodeList.add("theme");
stopNodeList.add("world");
stopNodeList.add("independence");
stopNodeList.add("office");
stopNodeList.add("year");
stopNodeList.add("work");
}
public void fixStopWord(QueryLogger qlog, DependencyTree ds)
{
String qStr = qlog.s.plainText.toLowerCase();
//... [which]
for(int i=2;i<qlog.s.words.length;i++)
if(qlog.s.words[i].baseForm.equals("which"))
stopNodeList.add(qlog.s.words[i].baseForm);
//take [place]
if(qStr.contains("take place") || qStr.contains("took place"))
stopNodeList.add("place");
//(When was Alberta admitted) as [province]
if(qStr.contains("as province"))
stopNodeList.add("province");
//what form of government is found in ...
if(qStr.contains("form of government"))
stopNodeList.add("government");
//alma mater of the chancellor
if(qStr.contains("alma mater of the chancellor"))
{
stopNodeList.add("chancellor");
}
//How large is the area of UK?
if(qStr.contains("the area of") || qStr.contains("how big"))
{
stopNodeList.add("area");
}
//how much is the total population of european union?
if(qStr.contains("how much"))
{
stopNodeList.add("population");
stopNodeList.add("elevation");
}
//when was the founding date of french fifth republic
if(qStr.contains("when was the"))
{
stopNodeList.add("founding");
stopNodeList.add("date");
stopNodeList.add("death");
stopNodeList.add("episode");
}
if(qStr.contains("what other book"))
{
stopNodeList.add("book");
}
//Is [Michelle Obama] the [wife] of Barack Obama?
if(qlog.s.words[0].baseForm.equals("be") && isNode(ds.getNodeByIndex(2)) && ds.getNodeByIndex(3).dep_father2child.equals("det")
&& isNode(ds.getNodeByIndex(4)) && qlog.s.words[4].baseForm.equals("of"))
stopNodeList.add(ds.getNodeByIndex(4).word.baseForm);
stopNodeList.add("信仰");
stopNodeList.add("人");
}

// Semantic Parsing for DBpedia.
// Semantic Parsing for Pkubase.
public ArrayList<SemanticUnit> process(QueryLogger qlog)
{
try
@@ -135,15 +70,15 @@ public class BuildQueryGraph
* 3)Coreference resolution.
* */
//0) Fix stop words
fixStopWord(qlog, ds);
// fixStopWord(qlog, ds);
//1) Detect Modifier/Modified
//rely on sentence (rather than dependency tree)
//with some ADJUSTMENT (eg, ent+noun(noType&&noEnt) -> noun.omitNode=TRUE)
for(Word word: qlog.s.words)
getTheModifiedWordBySentence(qlog.s, word); //Find continuous modifier
for(Word word: qlog.s.words)
getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier
// for(Word word: qlog.s.words)
// getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier
for(Word word: qlog.s.words)
if(word.modifiedWord == null) //Other words modify themselves. NOTICE: only can be called after detecting all modifier.
word.modifiedWord = word;
@@ -167,9 +102,9 @@ public class BuildQueryGraph
qlog.target = target.word;
// !target can NOT be entity. (except general question)| which [city] has most people?
if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.emList!=null)
// only when target.mayType = true or other entities exist.
if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.mayEnt && target.word.mayType)
{
//Counter example: Give me all Seven_Wonders_of_the_Ancient_World | (in fact, it is not an ENT but a CATEGORY, ?x subject Seve...)
target.word.mayEnt = false;
target.word.emList.clear();
}
@@ -241,6 +176,17 @@ public class BuildQueryGraph
curSU.neighborUnitList.add(expandSU);
}
}
if(semanticUnitList.size() == 1 && target.word.mayEnt)
{
Word[] words = qlog.s.words;
SemanticUnit curSU = semanticUnitList.get(0);
SemanticUnit expandSU = new SemanticUnit(words[words.length-1], false);
semanticUnitList.add(expandSU);
curSU.neighborUnitList.add(expandSU);
expandSU.neighborUnitList.add(curSU);
target = ds.getNodeByIndex(words.length);
qlog.target = target.word;
}
qlog.timeTable.put("BQG_structure", (int)(System.currentTimeMillis()-t));
//step2: Find relations (Notice, we regard that the coreference have been resolved now)
@@ -251,7 +197,7 @@ public class BuildQueryGraph
qlog.timeTable.put("BQG_relation", (int)(System.currentTimeMillis()-t));
//Prepare for item mapping
TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary
// TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary
TypeRecognition.constantVariableRecognition(qlog.semanticRelations, qlog); // Constant or Variable, embedded triples
//(just for display)
@@ -361,7 +307,7 @@ public class BuildQueryGraph
tmpRelations = new ArrayList<SimpleRelation>();
//Copy relations (for 'and', 'as soon as'...) |eg, In which films did Julia_Roberts and Richard_Gere play?
//TODO: judge by dependency tree | other way to supplement relations
if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("and"))
if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals(""))
{
for(SimpleRelation sr: simpleRelations)
{
@@ -566,6 +512,7 @@ public class BuildQueryGraph
return false;
}
// detect the target (question focus), also to detect some co-reference via rules. (TODO: test existing utils for co-reference resolution)
public DependencyTreeNode detectTarget(DependencyTree ds, QueryLogger qlog)
{
visited.clear();
@@ -583,8 +530,10 @@ public class BuildQueryGraph
// No wh-word: use the first node; NOTICE: consider MODIFIER rules. E.g., was us president Obama ..., target=obama (rather than us)
if(target == null)
{
for(Word word: words)
//Chinese sentences: the question focus usually appears near the end of the sentence.
for(int i=words.length-1; i>=0; i--)
{
Word word = words[i];
Word modifiedWord = word.modifiedWord;
if(modifiedWord != null && isNodeCandidate(modifiedWord))
{
@@ -594,42 +543,25 @@ public class BuildQueryGraph
}
if(target == null)
target = ds.nodesList.get(0);
/* Are [E|tree_frogs] a type of [E|amphibian] , type
*/
for(DependencyTreeNode dtn: target.childrenList)
{
if(dtn.word.baseForm.equals("type"))
{
dtn.word.represent = target.word;
}
}
target = ds.nodesList.get(0);
}
//where, NOTICE: wh target from NN may not pass the function isNode()
if(target.word.baseForm.equals("where"))
//where
if(target.word.baseForm.equals("哪里"))
{
int curPos = target.word.position - 1;
//!Where is the residence of
if(words[curPos+1].baseForm.equals("be") && words[curPos+2].posTag.equals("DT"))
//大兴安岭的[终点]是(哪里) (Where is the [end point] of 大兴安岭?)
if(curPos-2>=0 && isNodeCandidate(words[curPos-2]) && words[curPos-1].baseForm.equals("是"))
{
for(int i=curPos+4;i<words.length;i++)
if(words[i-1].posTag.startsWith("N") && words[i].posTag.equals("IN"))
{
target.word.represent = words[i-1];
target = ds.getNodeByIndex(i);
break;
}
target.word.represent = words[curPos-2];
target = ds.getNodeByIndex(words[curPos-2].position);
}
}
//which
if(target.word.baseForm.equals("which"))
if(target.word.baseForm.equals("哪些") || target.word.baseForm.equals("哪个"))
{
// test case: In which US state is Mount_McKinley located
// test case: 韩国有哪些著名景点? (What famous scenic spots are there in 韩国?)
int curPos = target.word.position-1;
if(curPos+1 < words.length)
{
@@ -639,27 +571,10 @@ public class BuildQueryGraph
// which city ... target = city
target.word.represent = word1;
target = ds.getNodeByIndex(word1.position);
int word1Pos = word1.position - 1;
// word1 + be + (the) + word2, and be is root: word1 & word2 may coreference
if(ds.root.word.baseForm.equals("be") && word1Pos+3 < words.length && words[word1Pos+1].baseForm.equals("be"))
{
// which city is [the] headquarters ...
Word word2 = words[word1Pos+2].modifiedWord;
if(words[word1Pos+2].posTag.equals("DT"))
word2 = words[word1Pos+3].modifiedWord;
int word2Pos = word2.position - 1;
if(word2Pos+1 < words.length && isNodeCandidate(word2) && words[word2Pos+1].posTag.startsWith("IN"))
{
//In which city is [the] headquarters of ... | target = headquarters, city & headquarters: coreference
//In which city was the president of Montenegro born? | COUNTER example, city & president: independent
target.word.represent = word2;
target = ds.getNodeByIndex(word2.position);
}
}
}
}
// by dependency tree
if(target.word.baseForm.equals("which"))
if(target.word.baseForm.equals("哪些") || target.word.baseForm.equals("哪个"))
{
//Which of <films> had the highest budget
boolean ok = false;
@@ -683,14 +598,14 @@ public class BuildQueryGraph
}
//what
else if(target.word.baseForm.equals("what"))
else if(target.word.baseForm.equals("什么"))
{
//Detect:what is [the] sth1 prep. sth2?
//Detect: 龙卷风的[英文名]是(什么) (What is the [English name] of 龙卷风?) | 金轮国师的(什么)[武功]有十龙十象之力? (Which [martial art] of 金轮国师 has the strength of ten dragons and ten elephants?)
//Omit: what is sth?
if(target.father != null && ds.nodesList.size()>=5)
{
DependencyTreeNode tmp1 = target.father;
if(tmp1.word.baseForm.equals("be"))
if(tmp1.word.baseForm.equals(""))
{
for(DependencyTreeNode child: tmp1.childrenList)
{
@@ -698,15 +613,13 @@ public class BuildQueryGraph
continue;
if(isNode(child))
{
//sth1
boolean hasPrep = false;
boolean another_node = false;
for(DependencyTreeNode grandson: child.childrenList)
{ //prep
if(grandson.dep_father2child.equals("prep"))
hasPrep = true;
}
//Detect modifier: what is the sth1's [sth2]? | what is the largest [city]?
if(hasPrep || qlog.s.hasModifier(child.word))
if(isNode(grandson))
another_node = true;
//another node exists || detect modifier: what is the sth1's [sth2]? | what is the largest [city]?
if(another_node || qlog.s.hasModifier(child.word))
{
target.word.represent = child.word;
target = child;
@@ -715,82 +628,84 @@ public class BuildQueryGraph
}
}
}
//what sth || What airlines are (part) of the SkyTeam alliance?
//what sth: 什么山高于8000米 (What mountains are higher than 8000 meters?)
else if(isNode(tmp1))
{
target.word.represent = tmp1.word;
target = tmp1;
// Coreference resolution
int curPos = target.word.position - 1;
if(curPos+3<words.length && words[curPos+1].baseForm.equals("be")&&words[curPos+3].posTag.startsWith("IN") && words.length > 6)
{
words[curPos+2].represent = target.word;
}
target = tmp1;
}
}
// by sentence
if(target.word.baseForm.equals("what"))
if(target.word.baseForm.equals("什么"))
{
// 金轮国师的(什么)[武功]有十龙十象之力? (Which [martial art] of 金轮国师 has the strength of ten dragons and ten elephants?)
int curPos = target.word.position - 1;
// what be the [node] ... ? (Note: words.length INCLUDES the '?' symbol, unlike nodeList)
if(words.length > 5 && words[curPos+1].baseForm.equals("be") && words[curPos+2].baseForm.equals("the") && isNodeCandidate(words[curPos+3]))
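// 什么 immediately followed by a node candidate: that node becomes the new target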
if(curPos + 1 <= words.length - 1 && isNodeCandidate(words[curPos+1]))
{
target.word.represent = words[curPos+3];
target = ds.getNodeByIndex(words[curPos+3].position);
target.word.represent = words[curPos+1];
target = ds.getNodeByIndex(words[curPos+1].position);
}
}
}
//who
else if(target.word.baseForm.equals("who"))
else if(target.word.baseForm.equals(""))
{
//Detect:who is/does [the] sth1 prep. sth2? || Who was the pope that founded the Vatican_Television ? | Who does the voice of Bart Simpson?
//Detect: 武汉大学的现任[校长]是(谁)? (Who is the current [president] of 武汉大学?) | 和子女一起演过电影电视剧的[演员]有(谁)? (Which [actors] have appeared in films or TV series together with their children?)
//Others: who is sth? who do sth? | target = who
//test case: Who is the daughter of Robert_Kennedy married to?
if(ds.nodesList.size()>=5)
{ //who
for(DependencyTreeNode tmp1: ds.nodesList)
{
if(tmp1 != target.father && !target.childrenList.contains(tmp1))
continue;
if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do"))
{ //is
for(DependencyTreeNode child: tmp1.childrenList)
{
if(child == target)
continue;
if(isNode(child))
{ //sth1
boolean hasPrep = false;
for(DependencyTreeNode grandson: child.childrenList)
{ //prep
if(grandson.dep_father2child.equals("prep"))
hasPrep = true;
}
//Detect modifier: who is the sht1's sth2?
// if(hasPrep || qlog.s.plainText.contains(child.word.originalForm + " 's")) // replaced by detect modifier directly
if(hasPrep || qlog.s.hasModifier(child.word))
{
target.word.represent = child.word;
target = child;
break;
}
}
}
}
}
}
//test case: 湖上草是[谁]的(诗)? ([Whose] (poem) is 湖上草?)
// if(ds.nodesList.size()>=5)
// { //who
// for(DependencyTreeNode tmp1: ds.nodesList)
// {
// if(tmp1 != target.father && !target.childrenList.contains(tmp1))
// continue;
// if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do"))
// { //is
// for(DependencyTreeNode child: tmp1.childrenList)
// {
// if(child == target)
// continue;
// if(isNode(child))
// { //sth1
// boolean hasPrep = false;
// for(DependencyTreeNode grandson: child.childrenList)
// { //prep
// if(grandson.dep_father2child.equals("prep"))
// hasPrep = true;
// }
// //Detect modifier: who is the sth1's sth2?
// if(hasPrep || qlog.s.hasModifier(child.word))
// {
// target.word.represent = child.word;
// target = child;
// break;
// }
// }
// }
// }
// }
// }
// by sentence
if(target.word.baseForm.equals("who"))
if(target.word.baseForm.equals("谁"))
{
int curPos = target.word.position - 1;
// 'who' is usually a coreference when it is not the first word.
if(curPos - 1 >= 0 && isNodeCandidate(words[curPos-1]))
// [Node]是(谁) ('[Node] is who?')
if(curPos - 2 >= 0 && isNodeCandidate(words[curPos-2]))
{
target.word.represent = words[curPos-1];
target = ds.getNodeByIndex(words[curPos-1].position);
// 谁 at the end of the sentence: 武汉大学的现任[校长]是(谁) (Who is the current [president] of 武汉大学?)
if(curPos == words.length - 1 && (words[curPos-1].baseForm.equals("是") || words[curPos-1].baseForm.equals("有")) )
{
target.word.represent = words[curPos-2];
target = ds.getNodeByIndex(words[curPos-2].position);
}
// [湖上草]是谁的(诗) (Whose (poem) is [湖上草]?)
if(curPos + 2 == words.length-1 && words[curPos-1].baseForm.equals("是")
&& words[curPos+1].baseForm.equals("的") && isNodeCandidate(words[curPos+2]))
{
words[curPos+2].represent = words[curPos-2];
}
}
// Do nothing: [谁]的[女儿]嫁给了王思聪 ([Whose] [daughter] married 王思聪?)
}
}
//how
@@ -847,7 +762,7 @@ public class BuildQueryGraph
/*
* There are two cases of [ent]+[type]: 1) Chinese company 2) De_Beers company;
* For 1, Chinese -> company; for 2, De_Beers <- company
* Return: True : ent -> type | False : type <- ent
* Return: True : ent -> type | False : ent <- type
* */
public boolean checkModifyBetweenEntType(Word entWord, Word typeWord)
{
@@ -868,9 +783,9 @@ public class BuildQueryGraph
* Through the sentence rather than the dependency tree, as the latter is often incorrect
* Generally, consecutive nodes modify the last node; an exception is test case 3, so we apply a recursive search.
* test case:
* 1) the highest Chinese mountain
* 2) the Chinese popular director
* 3) the De_Beers company (company[type]-> De_Beers[ent])
* 1) 最高的中国山峰 (the highest Chinese mountain)
* 2) 中国流行歌手 (Chinese pop singer)
* 3) 谷歌公司 (Google company; 公司[type] -> 谷歌[ent])
* */
public Word getTheModifiedWordBySentence(Sentence s, Word curWord)
{
@@ -898,14 +813,14 @@ public class BuildQueryGraph
return curWord.modifiedWord = curWord;
}
//modify LEFT: ent + type(cur) : De_Beer company
//modify LEFT: ent + type(cur) : 谷歌 公司 (Google + company)
if(preWord != null && curWord.mayType && preWord.mayEnt) //ent + type(cur)
{
if(!checkModifyBetweenEntType(preWord, curWord)) //De_Beer <- company; note: even if more nodes follow the type, they are ignored here
return curWord.modifiedWord = preWord;
}
//modify itself: ent(cur) + type : De_Beer company
//modify itself: ent(cur) + type : 谷歌 公司 (Google + company)
if(nextModifiedWord != null && curWord.mayEnt && nextModifiedWord.mayType)
{
if(!checkModifyBetweenEntType(curWord, nextModifiedWord))


+ 11
- 30
src/qa/parsing/QuestionParsing.java View File

@@ -16,36 +16,20 @@ public class QuestionParsing {
}
public void getDependenciesAndNER (QueryLogger qlog) {
long t1 = System.currentTimeMillis();
try {
long t1 = System.currentTimeMillis();
qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser);
}catch(Exception e){
e.printStackTrace();
}
long t2 = System.currentTimeMillis();
try{
qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser);
}catch(Exception e){
//if errors occur, abandon malt tree
qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford;
System.err.println("MALT parser error! Use stanford parser instead.");
}
try {
long t3 = System.currentTimeMillis();
Globals.nerRecognizer.recognize(qlog.s);
long t4 = System.currentTimeMillis();
long t2 = System.currentTimeMillis();
// Globals.nerRecognizer.recognize(qlog.s); //TODO: check NER
System.out.println("====StanfordDependencies("+(t2-t1)+"ms)====");
System.out.println(qlog.s.dependencyTreeStanford);
System.out.println("====MaltDependencies("+(t3-t2)+"ms)====");
System.out.println(qlog.s.dependencyTreeMalt);
System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)====");
qlog.s.printNERResult();
// qlog.s.printNERResult();
qlog.timeTable.put("StanfordParser", (int)(t2-t1));
qlog.timeTable.put("MaltParser", (int)(t3-t2));
qlog.timeTable.put("NER", (int)(t4-t3));
} catch (Exception e) {
e.printStackTrace();
}
@@ -53,8 +37,7 @@ public class QuestionParsing {
public void recognizeSentenceType(QueryLogger qlog)
{
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)||
recognizeImperativeSentence(qlog.s.dependencyTreeMalt);
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford);
if (IsImperativeSentence)
{
qlog.s.sentenceType = SentenceType.ImperativeSentence;
@@ -66,16 +49,14 @@ public class QuestionParsing {
return;
}
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)||
recognizeSpecialQuestion(qlog.s.dependencyTreeMalt);
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford);
if (IsSpecialQuestion)
{
qlog.s.sentenceType = SentenceType.SpecialQuestion;
return;
}
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)||
recognizeGeneralQuestion(qlog.s.dependencyTreeMalt);
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford);
if (IsGeneralQuestion)
{
qlog.s.sentenceType = SentenceType.GeneralQuestion;


+ 0
- 41
src/rdf/MergedWord.java View File

@@ -1,41 +0,0 @@
package rdf;

import java.util.ArrayList;

import rdf.EntityMapping;
import rdf.TypeMapping;

public class MergedWord implements Comparable<MergedWord>
{
//original position
public int st,ed;
//position after merge (unselected is -1)
public int mergedPos = -1;
public String name;
public boolean mayCategory = false;
public boolean mayLiteral = false;
public boolean mayEnt = false;
public boolean mayType = false;
public ArrayList<EntityMapping> emList = null;
public ArrayList<TypeMapping> tmList = null;
public String category = null;
public MergedWord(int s,int e,String n)
{
st = s;
ed = e;
name = n;
}
@Override
//long to short
public int compareTo(MergedWord o)
{
int lenDiff = (this.ed-this.st) - (o.ed-o.st);
if (lenDiff > 0) return -1;
else if (lenDiff < 0) return 1;
return 0;
}
}

+ 1
- 1
src/rdf/SimpleRelation.java View File

@@ -65,7 +65,7 @@ public class SimpleRelation {
}
sumSelectivity = matchingScore*sumSelectivity*pidsup.support;
int pid = pidsup.predicateID;
if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5;
// if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5;
if (!pasList.containsKey(pid))
pasList.put(pid, sumSelectivity);

