diff --git a/src/addition/AddtionalFix.java b/src/addition/AddtionalFix.java index 7c82982..a9dc236 100644 --- a/src/addition/AddtionalFix.java +++ b/src/addition/AddtionalFix.java @@ -20,16 +20,10 @@ public class AddtionalFix public AddtionalFix() { - // Some category mappings for DBpedia, try automatic linking methods later. | base form - pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters"); - pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World"); - pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus"); - pattern2category.put("13_british_colony", "Thirteen_Colonies"); } public void process(QueryLogger qlog) { - fixCategory(qlog); oneTriple(qlog); oneNode(qlog); @@ -48,45 +42,10 @@ public class AddtionalFix spq.queryType = QueryType.Ask; } - public void fixCategory(QueryLogger qlog) - { - if(qlog == null || qlog.semanticUnitList == null) - return; - - String var = null, category = null; - for(SemanticUnit su: qlog.semanticUnitList) - { - if(su.centerWord.mayCategory) - { - var = "?"+su.centerWord.originalForm; - category = su.centerWord.category; - } - } - - if(category != null && var != null) - for(Sparql spq: qlog.rankedSparqls) - { - boolean occured = false; - for(Triple tri: spq.tripleList) - { - if(tri.subject.equals(var)) - { - occured = true; - break; - } - } - String oName = category; - String pName = "subject"; - int pid = Globals.pd.predicate_2_id.get(pName); - Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100); - spq.addTriple(triple); - } - } - /* recognize one-Node query * Two cases:1、Special question|Imperative sentence 2、General question * 1-1:how many [], highest [] ... | For single variable, add constraint (aggregation) - * 1-2: What is backgammon? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...) + * 1-2: 谁是狄仁杰? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...) * 1-3: Give me all Seven Wonders of the Ancient World. | Notice, "Seven Wonders of the Ancient World" should be recognized as ENT before. (in fact it is CATEGORY in DBpeida) * 2-1: Are there any [castles_in_the_United_States](yago:type) * 2-2:Was Sigmund Freud married? | Lack of variable node. @@ -101,7 +60,7 @@ public class AddtionalFix Word[] words = qlog.s.words; if(qlog.s.sentenceType != SentenceType.GeneralQuestion) { - //1-1: how many [type] are there | List all [type] + //1-1: 有多少[type] | 列出所有[type] if(target.mayType && target.tmList != null) { String subName = "?"+target.originalForm; @@ -111,10 +70,10 @@ public class AddtionalFix sparql.addTriple(triple); qlog.rankedSparqls.add(sparql); } - //1-2: What is [ent]? else if(target.mayEnt && target.emList != null) { - if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) + //1-2: 什么是[ent] + if(words.length >= 3 && (words[0].baseForm.equals("什么") || words[0].baseForm.equals("谁")) && words[1].baseForm.equals("是")) { int eid = target.emList.get(0).entityID; String subName = target.emList.get(0).entityName; @@ -123,24 +82,14 @@ public class AddtionalFix sparql.addTriple(triple); qlog.rankedSparqls.add(sparql); } - } - //1-3: Give me all Seven Wonders of the Ancient World. 
- else if(target.mayCategory && target.category != null) - { - String oName = target.category; - String pName = "subject"; - int pid = Globals.pd.predicate_2_id.get(pName); - Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100); - Sparql sparql = new Sparql(); - sparql.addTriple(triple); - qlog.rankedSparqls.add(sparql); + //1-3: [ent] with other relations } } - else + else { if(target.mayEnt && target.emList != null) { - //2-2:Was Sigmund Freud married? + //2-2:[ent]结婚了吗? String relMention = ""; for(Word word: words) if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?")) @@ -162,34 +111,6 @@ public class AddtionalFix sparql.addTriple(triple); qlog.rankedSparqls.add(sparql); } - - //2-3:Are penguins endangered? - else - { - if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm)) - { - String oName = pattern2category.get(words[target.position].baseForm); - String pName = "subject"; - int pid = Globals.pd.predicate_2_id.get(pName); - int eid = target.emList.get(0).entityID; - String subName = target.emList.get(0).entityName; - Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100); - Sparql sparql = new Sparql(); - sparql.addTriple(triple); - qlog.rankedSparqls.add(sparql); - } - } - } - //2-1: Are there any [castles_in_the_United_States](yago:type) - else if(target.mayType && target.tmList != null) - { - String typeName = target.tmList.get(0).typeName; - String subName = "?" + target.originalForm; - //System.out.println("typeName="+typeName+" subName="+subName); - Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); - Sparql sparql = new Sparql(); - sparql.addTriple(triple); - qlog.rankedSparqls.add(sparql); } } } diff --git a/src/fgmt/RelationFragment.java b/src/fgmt/RelationFragment.java index 05332a4..24c539a 100644 --- a/src/fgmt/RelationFragment.java +++ b/src/fgmt/RelationFragment.java @@ -46,7 +46,9 @@ public class RelationFragment extends Fragment public static void load() throws Exception { - String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt"; + System.out.println("Loading relation IDs and Fragments ..."); + + String filename = Globals.localPath + "data/pkubase/fragments/pkubase_predicate_fragment.txt"; List inputs = FileUtil.readFile(filename); relFragments = new HashMap>(); literalRelationSet = new HashSet(); @@ -72,7 +74,7 @@ public class RelationFragment extends Fragment public static void loadId() throws IOException { - String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt"; + String filename = Globals.localPath + "data/pkubase/fragments/id_mappings/pkubase_predicate_id.txt"; List inputs = FileUtil.readFile(filename); relationShortName2IdList = new HashMap>(); diff --git a/src/fgmt/TypeFragment.java b/src/fgmt/TypeFragment.java index 2c5bc10..29938cc 100644 --- a/src/fgmt/TypeFragment.java +++ b/src/fgmt/TypeFragment.java @@ -19,8 +19,6 @@ public class TypeFragment extends Fragment { public static HashMap typeId2ShortName = null; public static final int NO_RELATION = -24232; - public static HashSet yagoTypeList = null; - public HashSet inEdges = new HashSet(); public HashSet outEdges = new HashSet(); public HashSet entSet = new HashSet(); @@ -33,26 +31,6 @@ public class TypeFragment extends Fragment { * 4, others: peace、vice */ public static 
ArrayList stopYagoTypeList = null; - static void loadStopYagoTypeList() - { - stopYagoTypeList = new ArrayList(); - stopYagoTypeList.add("Amazon"); - stopYagoTypeList.add("Earth"); - stopYagoTypeList.add("TheHungerGames"); - stopYagoTypeList.add("SparklingWine"); - stopYagoTypeList.add("Type"); - stopYagoTypeList.add("Flow"); - stopYagoTypeList.add("Owner"); - stopYagoTypeList.add("Series"); - stopYagoTypeList.add("Shot"); - stopYagoTypeList.add("Part"); - stopYagoTypeList.add("Care"); - stopYagoTypeList.add("Peace"); - stopYagoTypeList.add("Vice"); - stopYagoTypeList.add("Dodo"); - stopYagoTypeList.add("CzechFilms"); - stopYagoTypeList.add("ChineseFilms"); - } public TypeFragment(String fgmt, int fid) { @@ -100,7 +78,7 @@ public class TypeFragment extends Fragment { public static void load() throws Exception { - String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt"; + String filename = Globals.localPath+"data/pkubase/fragments/pkubase_type_fragment.txt"; File file = new File(filename); InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); @@ -128,14 +106,13 @@ public class TypeFragment extends Fragment { // can fix some data there // load Type Id loadId(); - System.out.println("Load "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types."); + System.out.println("Load "+typeId2ShortName.size()+" basic types."); } public static void loadId() throws IOException { - String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt"; - String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt"; - + String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_type_id.txt"; + File file = new File(filename); InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); BufferedReader br = new BufferedReader(in); @@ -161,19 +138,5 @@ public class TypeFragment extends Fragment { typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ"); br.close(); - - //load YAGO types - in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8"); - br = new BufferedReader(in); - yagoTypeList = new HashSet(); - while((line = br.readLine())!=null) - { - String[] lines = line.split("\t"); - String typeName = lines[0]; - yagoTypeList.add(typeName); - } - - loadStopYagoTypeList(); - yagoTypeList.removeAll(stopYagoTypeList); } } diff --git a/src/lcn/BuildIndexForEntityFragments.java b/src/lcn/BuildIndexForEntityFragments.java deleted file mode 100644 index 0a2ec16..0000000 --- a/src/lcn/BuildIndexForEntityFragments.java +++ /dev/null @@ -1,119 +0,0 @@ -package lcn; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.util.Date; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexWriter; - -import qa.Globals; - - -public class BuildIndexForEntityFragments{ - public void indexforentity() throws Exception - { - if(EntityFragmentFields.entityId2Name == null) - EntityFragmentFields.load(); - - long startTime = new Date().getTime(); - - //Try update KB index to DBpedia2015. by husen 2016-04-08 - //Try update KB index to DBpedia2016. 
by husen 2018-8-22 - File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index"); - File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"); - - Analyzer luceneAnalyzer_en = new StandardAnalyzer(); - IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true); - - int mergeFactor = 100000; //default 10 - int maxBufferedDoc = 1000; //default 10 - int maxMergeDoc = Integer.MAX_VALUE; //INF - - //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; - indexWriter_en.setMergeFactor(mergeFactor); - indexWriter_en.setMaxBufferedDocs(maxBufferedDoc); - indexWriter_en.setMaxMergeDocs(maxMergeDoc); - - - FileInputStream file = new FileInputStream(sourceDir_en); - InputStreamReader in = new InputStreamReader(file,"UTF-8"); - BufferedReader br = new BufferedReader(in); - - int count = 0; - while(true) - { - String _line = br.readLine(); - { - if(_line == null) break; - } - count++; - if(count % 100000 == 0) - System.out.println(count); - - String line = _line; - String temp[] = line.split("\t"); - - if(temp.length != 2) - continue; - else - { - int entity_id = Integer.parseInt(temp[0]); - if(!EntityFragmentFields.entityId2Name.containsKey(entity_id)) - continue; - - String entity_name = EntityFragmentFields.entityId2Name.get(entity_id); - String entity_fragment = temp[1]; - entity_name = entity_name.replace("____", " "); - entity_name = entity_name.replace("__", " "); - entity_name = entity_name.replace("_", " "); - - - Document document = new Document(); - - Field EntityName = new Field("EntityName", entity_name, Field.Store.YES, - Field.Index.TOKENIZED, - Field.TermVector.WITH_POSITIONS_OFFSETS); - Field EntityId = new Field("EntityId", String.valueOf(entity_id), - Field.Store.YES, Field.Index.NO); - Field EntityFragment = new Field("EntityFragment", entity_fragment, - Field.Store.YES, Field.Index.NO); - - document.add(EntityName); - document.add(EntityId); - document.add(EntityFragment); - indexWriter_en.addDocument(document); - } - } - - indexWriter_en.optimize(); - indexWriter_en.close(); - br.close(); - - // input the time of Build index - long endTime = new Date().getTime(); - System.out.println("entity_name index has build ->" + count + " " + "Time:" + (endTime - startTime)); - } - - public static void main(String[] args) - { - BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments(); - - try - { - Globals.localPath="D:/husen/gAnswer/"; - bef.indexforentity(); - } - catch (Exception e) - { - e.printStackTrace(); - } - } -} - - diff --git a/src/lcn/BuildIndexForTypeShortName.java b/src/lcn/BuildIndexForTypeShortName.java deleted file mode 100644 index 78b55f7..0000000 --- a/src/lcn/BuildIndexForTypeShortName.java +++ /dev/null @@ -1,107 +0,0 @@ -package lcn; - -import java.io.File; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.index.IndexWriter; - -import qa.Globals; -import fgmt.TypeFragment; - -public class BuildIndexForTypeShortName { - public static void buildIndex(HashMap> typeShortName2IdList) throws Exception - { - long startTime = new Date().getTime(); - File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); - - Analyzer luceneAnalyzer_li = new 
StandardAnalyzer(); - IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); - - int mergeFactor = 100000; - int maxBufferedDoc = 1000; - int maxMergeDoc = Integer.MAX_VALUE; - - //indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; - indexWriter_li.setMergeFactor(mergeFactor); - indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); - indexWriter_li.setMaxMergeDocs(maxMergeDoc); - - int count = 0; - Iterator it = typeShortName2IdList.keySet().iterator(); - while (it.hasNext()) - { - String sn = it.next(); - if (sn.length() == 0) { - continue; - } - - count ++; - - StringBuilder splittedSn = new StringBuilder(""); - - if(sn.contains("_")) - { - String nsn = sn.replace("_", " "); - splittedSn.append(nsn.toLowerCase()); - } - else - { - int last = 0, i = 0; - for(i = 0; i < sn.length(); i ++) - { - // if it were not a small letter, then break it. - if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) - { - splittedSn.append(sn.substring(last, i).toLowerCase()); - splittedSn.append(' '); - last = i; - } - } - splittedSn.append(sn.substring(last, i).toLowerCase()); - while(splittedSn.charAt(0) == ' ') { - splittedSn.deleteCharAt(0); - } - } - - System.out.println("SplitttedType: "+splittedSn); - - Document document = new Document(); - - Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), - Field.Store.YES, - Field.Index.TOKENIZED, - Field.TermVector.WITH_POSITIONS_OFFSETS); - Field TypeShortName = new Field("TypeShortName", sn, - Field.Store.YES, Field.Index.NO); - - document.add(SplittedTypeShortName); - document.add(TypeShortName); - indexWriter_li.addDocument(document); - } - - indexWriter_li.optimize(); - indexWriter_li.close(); - - // input the time of Build index - long endTime = new Date().getTime(); - System.out.println("TypeShortName index has build ->" + count + " " + "Time:" + (endTime - startTime)); - } - - public static void main (String[] args) { - try { - Globals.localPath="D:/husen/gAnswer/"; - TypeFragment.load(); - BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList); - } catch (Exception e) { - e.printStackTrace(); - } - } - -} diff --git a/src/lcn/EntityFragmentFields.java b/src/lcn/EntityFragmentFields.java index 0b1a873..edae368 100644 --- a/src/lcn/EntityFragmentFields.java +++ b/src/lcn/EntityFragmentFields.java @@ -5,9 +5,13 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; +import fgmt.EntityFragment; import qa.Globals; +import utils.FileUtil; public class EntityFragmentFields { @@ -18,8 +22,8 @@ public class EntityFragmentFields { public static void load() throws IOException { - String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt"; - String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"; + String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_entity_id.txt"; + String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment.txt"; File file = new File(filename); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); @@ -35,7 +39,7 @@ public class EntityFragmentFields { while((line = br.readLine()) != null) { String[] lines = line.split("\t"); - String entName = lines[0].substring(1, lines[0].length()-1); + String entName = lines[0].trim().substring(1, 
lines[0].length()-1); entityName2Id.put(entName, Integer.parseInt(lines[1])); entityId2Name.put(Integer.parseInt(lines[1]), entName); @@ -61,4 +65,41 @@ public class EntityFragmentFields { br.close(); } + + public static void genmini() + { + String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkuentity_id.txt"; + String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment_mini.txt"; + List fragments = FileUtil.readFile(fragmentFileName); + ArrayList eids = new ArrayList(); + for(String fragment: fragments) + { + int eid = Integer.parseInt(fragment.split("\t")[0]); + String fgmt = fragment.split("\t")[1]; + EntityFragment ef = new EntityFragment(eid, fgmt); + eids.add(eid); + for(int ent: ef.inEntMap.keySet()) + { + eids.add(ent); + } + for(int ent: ef.outEntMap.keySet()) + { + eids.add(ent); + } + } + System.out.println(eids.size()); + System.out.println("Loading entity id ..."); + List data = FileUtil.readFile(filename); + for(String line: data) + { + String[] lines = line.split("\t"); + int eid = Integer.parseInt(lines[1]); + if(eids.contains(eid)) + System.out.println(line); + } + } + + public static void main(String[] args) { + EntityFragmentFields.genmini(); + } } diff --git a/src/log/QueryLogger.java b/src/log/QueryLogger.java index 454c37d..28b56e3 100644 --- a/src/log/QueryLogger.java +++ b/src/log/QueryLogger.java @@ -12,7 +12,6 @@ import qa.Query; import rdf.EntityMapping; import rdf.SemanticRelation; import rdf.Sparql; -import rdf.MergedWord; import rdf.SemanticUnit; import qa.Answer; import nlp.ds.Sentence; @@ -30,10 +29,8 @@ public class QueryLogger { public boolean MODE_debug = false; public boolean MODE_log = true; public boolean MODE_fragment = true; - public boolean isMaltParserUsed = true; // Notice, we utilize Malt Parser as default parser, which is different from the older version. TODO: some coref rules need changed to fit Malt Parser. - + public boolean isMaltParserUsed = false; // MaltParser is deprecated.
public HashMap timeTable = null; - public ArrayList mWordList = null; public ArrayList semanticUnitList = null; public HashMap semanticRelations = null; public HashMap potentialSemanticRelations = null; @@ -48,7 +45,6 @@ public class QueryLogger { { timeTable = new HashMap(); rankedSparqls = new ArrayList(); - mWordList = query.mWordList; } public void reloadSentence(Sentence sentence) diff --git a/src/nlp/ds/DependencyTree.java b/src/nlp/ds/DependencyTree.java index 6169f62..6bdb736 100644 --- a/src/nlp/ds/DependencyTree.java +++ b/src/nlp/ds/DependencyTree.java @@ -6,75 +6,37 @@ import java.util.HashMap; import java.util.List; import java.util.Stack; -import nlp.tool.CoreNLP; -import nlp.tool.MaltParser; import nlp.tool.StanfordParser; - -import org.maltparser.core.exception.MaltChainedException; -import org.maltparser.core.syntaxgraph.DependencyStructure; -import org.maltparser.core.syntaxgraph.node.DependencyNode; - +import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.IndexedWord; +import edu.stanford.nlp.ling.SentenceUtils; import edu.stanford.nlp.trees.GrammaticalStructure; import edu.stanford.nlp.trees.TypedDependency; -import edu.stanford.nlp.trees.semgraph.SemanticGraph; public class DependencyTree { public DependencyTreeNode root = null; public ArrayList nodesList = null; - public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded) - public GrammaticalStructure gs = null; // Method 2: Stanford Parser - public DependencyStructure maltGraph = null; // Method 3: MaltParser +// public GrammaticalStructure gs = null; // Method 2: Stanford Parser public HashMap> wordBaseFormIndex = null; - public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) { - SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText); - this.dependencies = dependencies; - - Stack stack = new Stack(); - IndexedWord iwRoot = dependencies.getFirstRoot(); - - HashMap map = new HashMap(); - nodesList = new ArrayList(); - - stack.push(iwRoot); - root = this.setRoot(sentence.getWordByIndex(iwRoot.index())); - map.put(iwRoot, root); - - while (!stack.empty()) - { - IndexedWord curIWNode = stack.pop(); - DependencyTreeNode curDTNode = map.get(curIWNode); - - for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) { - Word w = sentence.getWordByIndex(iwChild.index()); - DependencyTreeNode newDTNode = this.insert( - curDTNode, - w, - dependencies.reln(curIWNode, iwChild).getShortName()); - map.put(iwChild, newDTNode); - stack.push(iwChild); - } - - curDTNode.sortChildrenList(); - nodesList.add(curDTNode); - } - } - public DependencyTree (Sentence sentence, StanfordParser stanfordParser) { - this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText); - + HashMap map = new HashMap(); nodesList = new ArrayList(); - List tdl = gs.typedDependencies(false); +// String[] sent = { "这", "是", "一个", "简单", "的", "句子", "。" }; + String[] sent = sentence.getWordsArr(); + List rawWords = SentenceUtils.toCoreLabelList(sent); + List tdl = stanfordParser.getTypedDependencyList(rawWords); + // 1. generate all nodes. 
for (TypedDependency td : tdl) { // gov if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) { Word w = sentence.getWordByIndex(td.gov().index()); + w.posTag = td.gov().tag(); // POS TAG DependencyTreeNode newNode = new DependencyTreeNode(w); map.put(td.gov().index(), newNode); nodesList.add(newNode); @@ -82,6 +44,7 @@ public class DependencyTree { // dep if (!map.containsKey(td.dep().index())) { Word w = sentence.getWordByIndex(td.dep().index()); + w.posTag = td.dep().tag(); // POS TAG DependencyTreeNode newNode = new DependencyTreeNode(w); map.put(td.dep().index(), newNode); nodesList.add(newNode); @@ -118,139 +81,9 @@ public class DependencyTree { } } Collections.sort(nodesList, new DependencyTreeNodeComparator()); - for (DependencyTreeNode dtn : nodesList) { - dtn.linkNN(this); - } - } - - public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException { - try { - // the tokens are parsed in the following line - DependencyStructure graph = maltParser.getDependencyStructure(sentence); - this.maltGraph = graph; - //System.out.println(graph); - - HashMap map = new HashMap(); - ArrayList list = new ArrayList(); - Stack stack = new Stack(); - DependencyNode nroot = graph.getDependencyRoot(); - stack.add(nroot); - // 1. generate all nodes. - while (!stack.isEmpty()) { - DependencyNode n = stack.pop(); - DependencyNode sib = n.getRightmostDependent(); - int key = n.getIndex(); - //System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">"); - boolean flag = true; - while (sib != null) { - flag = false; - stack.push(sib); - sib = sib.getLeftSibling(); - } - if (flag) { - sib = n.getLeftmostDependent(); - while (sib != null) { - stack.push(sib); - sib = sib.getRightSibling(); - } - } - if (n.hasHead() && !map.containsKey(key)) { - //String snode = n.toString(); - String sedge = n.getHeadEdge().toString(); - //System.out.println("[" + snode + "] <" + sedge + ">"); - - /*int position = 0; - String wordOriginal = null; - String wordBase; - String postag = null;*/ - String dep = null; - int idx1, idx2; - - /*// position - idx1 = snode.indexOf("ID:")+3; - idx2 = snode.indexOf(' ', idx1); - position = Integer.parseInt(snode.substring(idx1, idx2)); - - // word - idx1 = snode.indexOf("FORM:", idx2)+5; - idx2 = snode.indexOf(' ', idx1); - wordOriginal = snode.substring(idx1, idx2); - wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase()); - - // postag - idx1 = snode.indexOf("POSTAG:", idx2)+7; - idx2 = snode.indexOf(' ', idx1); - postag = snode.substring(idx1, idx2);*/ - - // dep - idx1 = sedge.lastIndexOf(':')+1; - idx2 = sedge.lastIndexOf(' '); - dep = sedge.substring(idx1, idx2); - if (dep.equals("null")) { - dep = null; - } - else if (dep.equals("punct")) {// No consider about punctuation - continue; - } - - DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key)); - newNode.dep_father2child = dep; - map.put(key, newNode); - list.add(newNode); - } - } - - - // 2. add edges - for (Integer k : map.keySet()) { - DependencyNode n = graph.getDependencyNode(k); - DependencyTreeNode dtn = map.get(k); - if (dtn.dep_father2child == null) { - this.setRoot(dtn); - this.root.levelInTree = 0; - this.root.dep_father2child = "root"; - } - else { - DependencyTreeNode father = map.get(n.getHead().getIndex()); - DependencyTreeNode child = map.get(n.getIndex()); - child.father = father; - father.childrenList.add(child); - } - } - - // Fix the tree for some cases. 
- if(list.size() > 11) - { - DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5); - if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be")) - { - if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie")) - { - dt1.father.childrenList.remove(dt1); - dt1.father = dt2; - dt2.childrenList.add(dt1); - } - } - } - - // add levelInTree, sort childrenList & nodesList - for (DependencyTreeNode dtn : list) { - if (dtn.father != null) { - dtn.levelInTree = dtn.father.levelInTree + 1; - dtn.sortChildrenList(); - } - } - - nodesList = list; - Collections.sort(nodesList, new DependencyTreeNodeComparator()); - for (DependencyTreeNode dtn : nodesList) { - dtn.linkNN(this); - } - } catch (MaltChainedException e) { - //e.printStackTrace(); - //System.err.println("MaltParser exception: " + e.getMessage()); - throw e; - } +// for (DependencyTreeNode dtn : nodesList) { +// dtn.linkNN(this); +// } } public DependencyTreeNode setRoot(Word w) { diff --git a/src/nlp/ds/Sentence.java b/src/nlp/ds/Sentence.java index 8f95b27..1880e8c 100644 --- a/src/nlp/ds/Sentence.java +++ b/src/nlp/ds/Sentence.java @@ -2,10 +2,10 @@ package nlp.ds; import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import qa.Globals; import qa.Query; -import rdf.MergedWord; public class Sentence { public String plainText = null; @@ -18,40 +18,64 @@ public class Sentence { public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence} public SentenceType sentenceType = SentenceType.SpecialQuestion; - public Sentence (String s) +// public Sentence (String s) +// { +// plainText = s; +// words = Globals.coreNLP.getTaggedWords(plainText); +// map = new HashMap(); +// for (Word w : words) +// map.put(w.key, w); +// } + + // for tokenized sentence + public Sentence (List wordList, String s) { plainText = s; - words = Globals.coreNLP.getTaggedWords(plainText); + words = new Word[wordList.size()]; + for(int i=0; i(); for (Word w : words) map.put(w.key, w); } - public Sentence (Query query, String s) - { - plainText = s; - words = Globals.coreNLP.getTaggedWords(plainText); - // inherit NodeRecognition's information - for(Word word: words) +// public Sentence (Query query, String s) +// { +// plainText = s; +// words = Globals.coreNLP.getTaggedWords(plainText); +// // inherit NodeRecognition's information +// for(Word word: words) +// { +// for(MergedWord mWord: query.mWordList) +// { +// if(word.originalForm.equals(mWord.name)) +// { +// word.mayLiteral = mWord.mayLiteral; +// word.mayEnt = mWord.mayEnt; +// word.mayType = mWord.mayType; +// word.mayCategory = mWord.mayCategory; +// word.tmList = mWord.tmList; +// word.emList = mWord.emList; +// word.category = mWord.category; +// } +// } +// } +// map = new HashMap(); +// for (Word w : words) +// map.put(w.key, w); +// } + + public String[] getWordsArr() { + String[] wordArr = new String[words.length]; + int cnt = 0; + for(Word w: words) { - for(MergedWord mWord: query.mWordList) - { - if(word.originalForm.equals(mWord.name)) - { - word.mayLiteral = mWord.mayLiteral; - word.mayEnt = mWord.mayEnt; - word.mayType = mWord.mayType; - word.mayCategory = mWord.mayCategory; - word.tmList = mWord.tmList; - word.emList = mWord.emList; - word.category = mWord.category; - } - } + wordArr[cnt++] = w.originalForm; } - map = new HashMap(); - for (Word w : words) - map.put(w.key, w); + return wordArr; } + public ArrayList getWordsByString (String w) { ArrayList ret = new ArrayList(); for (Word wo: words) { 
diff --git a/src/nlp/tool/CoreNLP.java b/src/nlp/tool/CoreNLP.java deleted file mode 100644 index 3905fda..0000000 --- a/src/nlp/tool/CoreNLP.java +++ /dev/null @@ -1,201 +0,0 @@ -package nlp.tool; - -import java.util.List; -import java.util.Properties; - -import nlp.ds.Word; -import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.Annotation; -import edu.stanford.nlp.pipeline.StanfordCoreNLP; -import edu.stanford.nlp.trees.Tree; -import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; -import edu.stanford.nlp.trees.semgraph.SemanticGraph; -import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; -import edu.stanford.nlp.util.CoreMap; - -public class CoreNLP { - - // CoreNLP can also recognize TIME and NUMBER (see SUTime) - private StanfordCoreNLP pipeline_lemma; - - public CoreNLP () { - // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution - /*Properties props_all = new Properties(); - props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref" - pipeline_all = new StanfordCoreNLP(props_all);*/ - - Properties props_lemma = new Properties(); - props_lemma.put("annotators", "tokenize, ssplit, pos, lemma"); - pipeline_lemma = new StanfordCoreNLP(props_lemma); - - } - - // For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html" - public String getBaseFormOfPattern (String text) { - String ret = new String(""); - - // create an empty Annotation just with the given text - Annotation document = new Annotation(text); - // run all Annotators on this text - pipeline_lemma.annotate(document); - - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List sentences = document.get(SentencesAnnotation.class); - - int count = 0; - for(CoreMap sentence: sentences) { - // traversing the words in the current sentence - // a CoreLabel is a CoreMap with additional token-specific methods - for (CoreLabel token: sentence.get(TokensAnnotation.class)) { - // this is the base form (lemma) of the token - String lemma = token.getString(LemmaAnnotation.class); - ret += lemma; - ret += " "; - } - count ++; - if (count % 100 == 0) { - System.out.println(count); - } - } - - return ret.substring(0, ret.length()-1); - } - - public SemanticGraph getBasicDependencies (String s) { - // create an empty Annotation just with the given text - Annotation document = new Annotation(s); - - // run all Annotators on this text - pipeline_lemma.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List sentences = document.get(SentencesAnnotation.class); - - for(CoreMap sentence: sentences) { - // this is the Stanford dependency graph of the current sentence - SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class); - return dependencies; - } - - 
return null; - } - - public Tree getParseTree (String text) { - // create an empty Annotation just with the given text - Annotation document = new Annotation(text); - - // run all Annotators on this text - pipeline_lemma.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List sentences = document.get(SentencesAnnotation.class); - - for(CoreMap sentence: sentences) { - // this is the parse tree of the current sentence - return sentence.get(TreeAnnotation.class); - } - - return null; - } - - /** - * How to use: - * for (CoreLabel token : sentence.get(TokensAnnotation.class)) { - * // this is the text of the token - * String word = token.get(TextAnnotation.class); - * // this is the POS tag of the token - * String pos = token.get(PartOfSpeechAnnotation.class); - * } - * @param s - * @return - */ - public CoreMap getPOS (String s) { - // create an empty Annotation just with the given text - Annotation document = new Annotation(s); - - // run all Annotators on this text - pipeline_lemma.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List sentences = document.get(SentencesAnnotation.class); - - for(CoreMap sentence: sentences) { - // this is the sentence with POS Tags - return sentence; - } - - return null; - } - - public Word[] getTaggedWords (String sentence) { - CoreMap taggedSentence = getPOS(sentence); - Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()]; - int count = 0; - for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) { - // this is the text of the token - String word = token.get(TextAnnotation.class); - // this is the POS tag of the token - String pos = token.get(PartOfSpeechAnnotation.class); - //System.out.println(word+"["+pos+"]"); - ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1); - count ++; - } - return ret; - } - - /*public void demo () { - // creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution - Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); - StanfordCoreNLP pipeline = new StanfordCoreNLP(props); - - // read some text in the text variable - String text = ... // Add your text here! 
- - // create an empty Annotation just with the given text - Annotation document = new Annotation(text); - - // run all Annotators on this text - pipeline.annotate(document); - - // these are all the sentences in this document - // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types - List sentences = document.get(SentencesAnnotation.class); - - for(CoreMap sentence: sentences) { - // traversing the words in the current sentence - // a CoreLabel is a CoreMap with additional token-specific methods - for (CoreLabel token: sentence.get(TokensAnnotation.class)) { - // this is the text of the token - String word = token.get(TextAnnotation.class); - // this is the POS tag of the token - String pos = token.get(PartOfSpeechAnnotation.class); - // this is the NER label of the token - String ne = token.get(NamedEntityTagAnnotation.class); - } - - // this is the parse tree of the current sentence - Tree tree = sentence.get(TreeAnnotation.class); - - // this is the Stanford dependency graph of the current sentence - SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); - } - - // This is the coreference link graph - // Each chain stores a set of mentions that link to each other, - // along with a method for getting the most representative mention - // Both sentence and token offsets start at 1! - Map graph = - document.get(CorefChainAnnotation.class); - }*/ -} diff --git a/src/nlp/tool/Main.java b/src/nlp/tool/Main.java index 1a680a3..c86ecff 100644 --- a/src/nlp/tool/Main.java +++ b/src/nlp/tool/Main.java @@ -21,13 +21,10 @@ public class Main { break; try { long t1 = System.currentTimeMillis(); - Sentence s = new Sentence(question); + Sentence s = null; DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); System.out.println("====StanfordDependencies===="); System.out.println(dt); - DependencyTree dt2 = new DependencyTree(s, Globals.maltParser); - System.out.println("====MaltDependencies===="); - System.out.println(dt2); long t2 = System.currentTimeMillis(); System.out.println("time=" + (t2-t1) + "ms"); } catch (Exception e) { diff --git a/src/nlp/tool/MaltParser.java b/src/nlp/tool/MaltParser.java deleted file mode 100644 index 56e16bc..0000000 --- a/src/nlp/tool/MaltParser.java +++ /dev/null @@ -1,70 +0,0 @@ -package nlp.tool; - - -import nlp.ds.Sentence; -import nlp.ds.Word; - -import org.maltparser.MaltParserService; -import org.maltparser.core.exception.MaltChainedException; -import org.maltparser.core.syntaxgraph.DependencyStructure; - -import qa.Globals; - -public class MaltParser { - private MaltParserService service = null; - public MaltParser() { - try - { - System.out.print("Loading MaltParser ..."); - service = new MaltParserService(); - // Inititalize the parser model 'model0' and sets the working directory to '.' and sets the logging file to 'parser.log' - //service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . 
-lfi parser.log"); - service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log"); - firstParse(); - System.out.println("ok!"); - } catch (MaltChainedException e) { - e.printStackTrace(); - System.err.println("MaltParser exception: " + e.getMessage()); - } - } - - private void firstParse() { - String[] tokens = new String[12]; - tokens[0] = "1\tIn\t_\tIN\tIN\t_"; - tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; - tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; - tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; - tokens[4] = "5\tby\t_\tIN\tIN\t_"; - tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; - tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; - tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; - tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; - tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; - tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; - tokens[11] = "12\t?\t_\t.\t.\t_"; - try { - service.parse(tokens); - } catch (MaltChainedException e) { - e.printStackTrace(); - } - } - - public DependencyStructure getDependencyStructure (Sentence sentence) { - try { - return service.parse(getTaggedTokens(sentence)); - } catch (MaltChainedException e) { - e.printStackTrace(); - } - return null; - } - - private String[] getTaggedTokens (Sentence sentence) { - String[] ret = new String[sentence.words.length]; - int count = 0; - for (Word w : sentence.words) { - ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); - count ++; - } - return ret; - } -} diff --git a/src/nlp/tool/MaltParserCon.java b/src/nlp/tool/MaltParserCon.java deleted file mode 100644 index 02214d4..0000000 --- a/src/nlp/tool/MaltParserCon.java +++ /dev/null @@ -1,73 +0,0 @@ -package nlp.tool; - -import java.io.File; -import java.net.URL; - -import nlp.ds.Sentence; -import nlp.ds.Word; - -import org.maltparser.concurrent.ConcurrentMaltParserModel; -import org.maltparser.concurrent.ConcurrentMaltParserService; -import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; -import org.maltparser.core.exception.MaltChainedException; -//import org.maltparser.core.syntaxgraph.DependencyStructure; - - -public class MaltParserCon { - private ConcurrentMaltParserModel model = null; - public ConcurrentDependencyGraph outputGraph = null; - - public MaltParserCon(){ - try{ - System.out.println("Loading Maltparser...\n"); - URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL(); - model = ConcurrentMaltParserService.initializeParserModel(ModelURL); - firstTest(); - System.out.println("ok!\n"); - }catch(Exception e){ - e.printStackTrace(); - System.err.println("MaltParser exception: " + e.getMessage()); - } - } - - private void firstTest(){ - String[] tokens = new String[12]; - tokens[0] = "1\tIn\t_\tIN\tIN\t_"; - tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; - tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; - tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; - tokens[4] = "5\tby\t_\tIN\tIN\t_"; - tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; - tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; - tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; - tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; - tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; - tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; - tokens[11] = "12\t?\t_\t.\t.\t_"; - try { - outputGraph = model.parse(tokens); - } catch (Exception e) { - e.printStackTrace(); - } - System.out.println(outputGraph); - } - - public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) { - try { - return model.parse(getTaggedTokens(sentence)); - } catch (MaltChainedException e) { - 
e.printStackTrace(); - } - return null; - } - - private String[] getTaggedTokens (Sentence sentence) { - String[] ret = new String[sentence.words.length]; - int count = 0; - for (Word w : sentence.words) { - ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); - count ++; - } - return ret; - } -} diff --git a/src/nlp/tool/NERecognizer.java b/src/nlp/tool/NERecognizer.java deleted file mode 100644 index 11928a3..0000000 --- a/src/nlp/tool/NERecognizer.java +++ /dev/null @@ -1,53 +0,0 @@ -package nlp.tool; - -import java.util.List; - -import qa.Globals; - -import nlp.ds.Sentence; -import nlp.ds.Word; - -import edu.stanford.nlp.ie.AbstractSequenceClassifier; -import edu.stanford.nlp.ie.crf.CRFClassifier; -import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; -import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; -import edu.stanford.nlp.ling.CoreLabel; - -public class NERecognizer { - - static String serializedClassifier; - static AbstractSequenceClassifier classifier; - //public static String localPath="E:\\Hanshuo\\gAnswer\\"; - - public NERecognizer() { - serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz"; - classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); - } - - /*public NERecognizer(String basePath, boolean flag) { - serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz"; - }*/ - - public void recognize(Sentence sentence) { - List lcl = classifier.classify(sentence.plainText).get(0); - for (CoreLabel cl : lcl) { - int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1; - Word w = sentence.getWordByIndex(position); - String ner = cl.get(AnswerAnnotation.class); - if (ner.equals("O")) w.ner = null; - else w.ner = ner; - } - } - - public static void main(String[] args) { - System.out.println("Test NER"); - Globals.init(); - - Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?" 
- Globals.nerRecognizer.recognize(s); - for (Word word : s.words) { - System.out.print(word + " "); - System.out.println("ner=" + word.ner); - } - } -} diff --git a/src/nlp/tool/StanfordParser.java b/src/nlp/tool/StanfordParser.java index 12e305c..8be592e 100644 --- a/src/nlp/tool/StanfordParser.java +++ b/src/nlp/tool/StanfordParser.java @@ -4,7 +4,6 @@ import java.io.StringReader; import java.util.List; import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.objectbank.TokenizerFactory; import edu.stanford.nlp.parser.lexparser.LexicalizedParser; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBTokenizer; @@ -13,39 +12,40 @@ import edu.stanford.nlp.trees.GrammaticalStructureFactory; import edu.stanford.nlp.trees.PennTreebankLanguagePack; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreebankLanguagePack; +import edu.stanford.nlp.trees.TypedDependency; +import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure; public class StanfordParser { private LexicalizedParser lp; - private TokenizerFactory tokenizerFactory; - private TreebankLanguagePack tlp; - private GrammaticalStructureFactory gsf; + private ChineseGrammaticalStructure gs; + +// private TokenizerFactory tokenizerFactory; +// private TreebankLanguagePack tlp; +// private GrammaticalStructureFactory gsf; public StanfordParser() { - lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); - tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); - tlp = new PennTreebankLanguagePack(); - gsf = tlp.grammaticalStructureFactory(); +// lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); +// tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); +// tlp = new PennTreebankLanguagePack(); +// gsf = tlp.grammaticalStructureFactory(); + + lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz"); } - public GrammaticalStructure getGrammaticalStructure (String sentence) { - List rawWords2 = - tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); - // Converts a Sentence/List/String into a Tree. - // In all circumstances, the input will be treated as a single sentence to be parsed. 
- Tree parse = lp.apply(rawWords2); - - return gsf.newGrammaticalStructure(parse); - /*List tdl = gs.typedDependencies(false); - for (TypedDependency td : tdl) { - System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")"); - System.out.println("gov="+td.gov() - +"\tgov.index=" - +td.gov().index() - +"\tgov.value=" - +td.gov().value() - +"\tgov.pos=" - +((TreeGraphNode)td.gov().parent()).value()); - }*/ - //System.out.println(tdl); +// public GrammaticalStructure getGrammaticalStructure (String sentence) { +// List rawWords2 = +// tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); +// +// Tree parse = lp.apply(rawWords2); +// +// return gsf.newGrammaticalStructure(parse); +// } + + public List getTypedDependencyList(List rawWords) + { + Tree parse = lp.apply(rawWords); + gs = new ChineseGrammaticalStructure(parse); + + return gs.typedDependenciesCCprocessed(); } } diff --git a/src/paradict/ParaphraseDictionary.java b/src/paradict/ParaphraseDictionary.java index 11c24bc..348566e 100644 --- a/src/paradict/ParaphraseDictionary.java +++ b/src/paradict/ParaphraseDictionary.java @@ -10,19 +10,17 @@ import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.List; +import com.huaban.analysis.jieba.SegToken; +import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; - - -import nlp.tool.CoreNLP; import qa.Globals; +import qa.extract.EntityRecognitionCh; public class ParaphraseDictionary { - public static String localDataPath; - public static String dbpedia_relation_paraphrases_baseform_withScore; - public static String dbpedia_relation_paraphrases_baseform_withScore_rerank; - public static String dbpedia_relation_paraphrases_handwrite; - public static String dbpedia_predicate_id; + public static String relation_paraphrases_path; + public static String predicate_id_path; public static String dbpedia_dbo_predicate; public HashMap predicate_2_id = null; @@ -41,24 +39,14 @@ public class ParaphraseDictionary { public int paraphrasedPredCount = 0; public int lineCount = 0; - /** - * constructor - * @param parser - * @param ner - */ public ParaphraseDictionary () { - String fixedPath = Globals.localPath; + String fixedPath = Globals.localPath+"data/pkubase/"; System.out.println(System.getProperty("user.dir")); - localDataPath = fixedPath + "data/DBpedia2016/parapharse/"; - dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt"; - dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt"; - - dbpedia_predicate_id = localDataPath + "16predicate_id.txt"; - dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt"; + relation_paraphrases_path = fixedPath + "paraphrase/pkubase-paraphrase.txt"; + predicate_id_path = fixedPath + "fragments/id_mappings/pkubase_predicate_id.txt"; bannedTypes = new HashSet(); - bannedTypes.add("Mayor"); relns_subject = new HashSet(); relns_subject.add("subj"); @@ -76,25 +64,16 @@ public class ParaphraseDictionary { relns_object.add("obj"); relns_object.add("pobj"); - prepositions = new HashSet(); - prepositions.add("in");//in at on with to from before after of for - prepositions.add("at"); - prepositions.add("on"); - prepositions.add("with"); - prepositions.add("to"); - prepositions.add("from"); - prepositions.add("before"); - prepositions.add("after"); - prepositions.add("of"); - prepositions.add("for"); - prepositions.add("as"); + 
prepositions = new HashSet(); //TODO: safe delete try { loadPredicateId(); - loadDboPredicate(); - loadParaDict(); + addPredicateAsNLPattern(); + addHandwriteAsNLPattern(); +// loadDboPredicate(); +// loadParaDict(); buildInvertedIndex(); - typePredicateID = predicate_2_id.get("type"); + typePredicateID = predicate_2_id.get("类型"); } catch (Exception e) { e.printStackTrace(); } @@ -108,8 +87,7 @@ public class ParaphraseDictionary { predicate_2_id = new HashMap(); id_2_predicate = new HashMap(); - String input_filename = dbpedia_predicate_id; - File file = new File(input_filename); + File file = new File(predicate_id_path); InputStreamReader in = null; BufferedReader br = null; try{ @@ -118,6 +96,8 @@ public class ParaphraseDictionary { String line = null; while ((line = br.readLine())!= null) { String[] lines = line.split("\t"); + if(lines[0].startsWith("<") && lines[0].endsWith(">")) + lines[0] = lines[0].substring(1, lines[0].length()-1); predicate_2_id.put(lines[0], Integer.parseInt(lines[1])); id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]); } @@ -192,13 +172,10 @@ public class ParaphraseDictionary { InputStreamReader in = null; BufferedReader br = null; try{ - String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank; - File file = new File(inputFileName); - in = new InputStreamReader(new FileInputStream(file), "utf-8"); + in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8"); br = new BufferedReader(in); String line = null; int lineCount = 0; - //line = br.readLine();//read the first line which indicates the format while ((line = br.readLine()) != null) { if (line.startsWith("#")) continue; @@ -259,72 +236,23 @@ public class ParaphraseDictionary { * A set of very important NL patterns are the predicates themselves! */ public void addPredicateAsNLPattern () { + if(nlPattern_2_predicateList == null) + nlPattern_2_predicateList = new HashMap>(); + final int support = 200; int predicate_id; for (String p : predicate_2_id.keySet()) { - // TODO: Omitting some bad relations (should be discarded in future) - if(p.equals("state") || p.equals("states")) - continue; - predicate_id = predicate_2_id.get(p); - StringBuilder pattern = new StringBuilder(""); - - // Work/runtime 11,SpaceStation/volume 68 and some predicates have prefix (DBpedia 2015), discard the prefix when generating pattern - if(p.contains("/")) + + // TODO: segmentation: 1) tokenize 2) single ch-word + String patternString = ""; + List q=EntityRecognitionCh.segmenter.process(p, SegMode.SEARCH); + for (SegToken t:q) { - if(p.charAt(0)>='A' && p.charAt(0)<='Z') - p = p.substring(p.indexOf("/")+1); - //gameW/l 1974 - else - p = p.replace("/", ""); - } - - int last = 0, i = 0; - for(i = 0; i < p.length(); i ++) { - // if it were not a small letter, then break it. - if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) { - pattern.append(p.substring(last, i).toLowerCase()); - pattern.append(" "); - last = i; - } + patternString += t.word + " "; } - pattern.append(p.substring(last, i).toLowerCase()); - for (i = 3; i < pattern.length(); i ++) { - // the blank between two digits should be deleted. - if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9' - && pattern.charAt(i-1)==' ' - && pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') { - pattern.deleteCharAt(i-1); - } - // the blank between I and D should be deleted. 
- else if (pattern.charAt(i)=='d' - && pattern.charAt(i-1)==' ' - && pattern.charAt(i-2)=='i' - && pattern.charAt(i-3)==' ') { - pattern.deleteCharAt(i-1); - } - // the blank between D and B should be deleted. - else if (pattern.charAt(i)=='b' - && pattern.charAt(i-1)==' ' - && pattern.charAt(i-2)=='d' - && pattern.charAt(i-3)==' ') { - pattern.deleteCharAt(i-1); - } - } - - // pattern -> base form - /*String[] ptns = pattern.toString().split(" "); - pattern = new StringBuilder(""); - for (String s : ptns) { - pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s)); - pattern.append(" "); - } - pattern.deleteCharAt(pattern.length()-1); - String patternString = pattern.toString();*/ - - // Special case cannot use base form, eg, foundingYear //TODO: maybe Porter's Algorithm - String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString()); + patternString = patternString.trim(); //System.out.println(p + "-->" + patternString); if (!nlPattern_2_predicateList.containsKey(patternString)) { @@ -340,30 +268,39 @@ public class ParaphraseDictionary { } public void addHandwriteAsNLPattern() throws IOException { - String inputFileName = dbpedia_relation_paraphrases_handwrite; InputStreamReader in = null; BufferedReader br = null; try{ - File file = new File(inputFileName); - in = new InputStreamReader(new FileInputStream(file), "utf-8"); + in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8"); br = new BufferedReader(in); String line = null; - //int lineCount = 0; - //line = br.readLine();//read the first line which indicates the format while ((line = br.readLine()) != null) { if (line.startsWith("#") || line.isEmpty()) continue; - //lineCount ++; + String[] content = line.split("\t"); if(!predicate_2_id.containsKey(content[0])) continue; int predicateID = predicate_2_id.get(content[0]); - String nlPattern = content[1].toLowerCase(); + String nlPattern = content[1]; int support = Integer.parseInt(content[2]); + // Need Segmentation + if(!nlPattern.contains(" ")) + { + String patternString = ""; + List q=EntityRecognitionCh.segmenter.process(nlPattern, SegMode.SEARCH); + for (SegToken t:q) + { + patternString += t.word + " "; + } + patternString = patternString.trim(); + nlPattern = patternString; + } + if (!nlPattern_2_predicateList.containsKey(nlPattern)) { nlPattern_2_predicateList.put(nlPattern, new ArrayList()); } @@ -434,7 +371,7 @@ public class ParaphraseDictionary { } public static void main (String[] args) { - Globals.coreNLP = new CoreNLP(); +// Globals.coreNLP = new CoreNLP(); Globals.pd = new ParaphraseDictionary(); //Globals.pd.showNLPatterns(); } diff --git a/src/qa/GAnswer.java b/src/qa/GAnswer.java index 000be6e..c89e59c 100644 --- a/src/qa/GAnswer.java +++ b/src/qa/GAnswer.java @@ -32,8 +32,8 @@ public class GAnswer { QueryLogger qlog = null; try { - if (input.length() <= 5) - return null; +// if (input.length() <= 5) +// return null; System.out.println("[Input:] "+input); @@ -47,17 +47,17 @@ public class GAnswer { // Try to solve each NR plan, and combine the ranked SPARQLs. // We only reserve LOG of BEST NR plan for convenience. 
+ // Now only 1 plan for(int i=query.sList.size()-1; i>=0; i--) { Sentence possibleSentence = query.sList.get(i); qlog.reloadSentence(possibleSentence); -// qlog.isMaltParserUsed = true; // LOG System.out.println("transQ: "+qlog.s.plainText); - qlog.NRlog = query.preLog; +// qlog.NRlog = query.preLog; qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n"; - qlog.SQGlog += qlog.NRlog; +// qlog.SQGlog += qlog.NRlog; qlog.timeTable.put("step0", (int)NRtime); // step 1: question parsing (dependency tree, sentence type) @@ -91,7 +91,7 @@ public class GAnswer { qlog.rankedSparqls = rankedSparqls; System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size()); - // Detect question focus. + // Detect question focus. TODO: in which cases the question focus != target? for (int i=0; i inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt"); + List inputList = FileUtil.readFile("data/test/mini-ccks.txt"); for(String input: inputList) { + if (input.length()<2 || input.charAt(0)!='q') continue; + System.out.println("----------------------------------------"); + System.out.println(input); + ArrayList outputs = new ArrayList(); ArrayList spqs = new ArrayList(); spqs.add("id:"+String.valueOf(i)); @@ -220,9 +224,9 @@ public class GAnswer { System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size()); outputs.add(qlog.SQGlog); - outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); - outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); - outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); +// outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); +// outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); +// outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); long excuting_st_time = System.currentTimeMillis(); Matches m = null; @@ -274,8 +278,10 @@ public class GAnswer { outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); } } + else + outputs.add(""); - FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true); + FileUtil.writeFile(outputs, "data/test/mini-ccks.out", true); } } diff --git a/src/qa/Globals.java b/src/qa/Globals.java index fd2a9b3..413b04d 100644 --- a/src/qa/Globals.java +++ b/src/qa/Globals.java @@ -8,26 +8,18 @@ import lcn.EntityFragmentFields; import fgmt.RelationFragment; import fgmt.TypeFragment; import paradict.ParaphraseDictionary; -import qa.mapping.DBpediaLookup; -import nlp.tool.NERecognizer; -import nlp.tool.CoreNLP; -import nlp.tool.MaltParser; import nlp.tool.StanfordParser; import nlp.tool.StopWordsList; public class Globals { // nlp tools - public static CoreNLP coreNLP; public static StanfordParser stanfordParser; public static StopWordsList stopWordsList; - public static MaltParser maltParser; - public static NERecognizer nerRecognizer; // relation paraphrase dictionary public static ParaphraseDictionary pd; // entity linking system - public static DBpediaLookup dblk; public static int MaxAnswerNum = 100; - public static String Dataset = "dbpedia 2016"; + public static String Dataset = "pkubase"; public static String Version = "0.1.2"; public static String GDBsystem = "gStore v0.7.2"; @@ -39,34 
+31,25 @@ public class Globals { public static int evaluationMethod = 2; public static String localPath = "./././"; - public static String QueryEngineIP = "dbpedia16.gstore-pku.com"; // Notice, PORT number is in the evaluation function. + public static String QueryEngineIP = "pkubase.gstore-pku.com"; // Notice, PORT number is in the evaluation function. public static int QueryEnginePort = 80; public static void init () { - System.out.println("====== gAnswer2.0 over DBpedia ======"); + System.out.println("====== gAnswer2.0 over Pkubase ======"); long t1, t2, t3, t4, t5, t6, t7, t8, t9; t1 = System.currentTimeMillis(); - coreNLP = new CoreNLP(); - - t2 = System.currentTimeMillis(); stanfordParser = new StanfordParser(); - t3 = System.currentTimeMillis(); - maltParser = new MaltParser(); - - t4 = System.currentTimeMillis(); - nerRecognizer = new NERecognizer(); - - t5 = System.currentTimeMillis(); + t2 = System.currentTimeMillis(); stopWordsList = new StopWordsList(); - t6 = System.currentTimeMillis(); + t3 = System.currentTimeMillis(); pd = new ParaphraseDictionary(); - t7 = System.currentTimeMillis(); + t4 = System.currentTimeMillis(); try { EntityFragmentFields.load(); @@ -78,20 +61,13 @@ public class Globals { e1.printStackTrace(); } - t8 = System.currentTimeMillis(); - dblk = new DBpediaLookup(); - - t9 = System.currentTimeMillis(); + t5 = System.currentTimeMillis(); System.out.println("======Initialization======"); - System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms."); - System.out.println("StanfordParser: " + (t3-t2) + "ms."); - System.out.println("MaltParser: " + (t4-t3) + "ms."); - System.out.println("NERecognizer: " + (t5-t4) + "ms."); - System.out.println("StopWordsList: " + (t6-t5) + "ms."); - System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms."); - System.out.println("GraphFragments: " + (t8-t7) + "ms."); - System.out.println("DBpediaLookup: " + (t9-t8) + "ms."); - System.out.println("* Total *: " + (t9-t1) + "ms."); + System.out.println("StanfordParser: " + (t2-t1) + "ms."); + System.out.println("StopWordsList: " + (t3-t2) + "ms."); + System.out.println("ParaphraseDict: " + (t4-t3) + "ms."); + System.out.println("GraphFragments: " + (t5-t4) + "ms."); + System.out.println("* Total *: " + (t5-t1) + "ms."); System.out.println("=========================="); } diff --git a/src/qa/Query.java b/src/qa/Query.java index 6ebada7..82cc506 100644 --- a/src/qa/Query.java +++ b/src/qa/Query.java @@ -1,10 +1,11 @@ package qa; import java.util.ArrayList; +import java.util.List; import nlp.ds.Sentence; -import qa.extract.EntityRecognition; -import rdf.MergedWord; +import nlp.ds.Word; +import qa.extract.EntityRecognitionCh; /** * 1. preprocessing of question @@ -21,7 +22,7 @@ public class Query public String queryId = null; public String preLog = ""; - public ArrayList mWordList = null; + public List words = null; public Query(){} public Query(String _question) @@ -32,15 +33,17 @@ public class Query TransferedQuestion = getTransferedQuestion(NLQuestion); // step1. 
NODE Recognition - MergedQuestionList = getMergedQuestionList(TransferedQuestion); +// MergedQuestionList = getMergedQuestionList(TransferedQuestion); + words = EntityRecognitionCh.parseSentAndRecogEnt(TransferedQuestion); // build Sentence sList = new ArrayList(); - for(String mergedQuestion: MergedQuestionList) - { - Sentence sentence = new Sentence(this, mergedQuestion); - sList.add(sentence); - } + sList.add(new Sentence(words, TransferedQuestion)); // TODO: TransferedQuestion or _question +// for(String mergedQuestion: MergedQuestionList) +// { +// Sentence sentence = new Sentence(this, mergedQuestion); +// sList.add(sentence); +// } } public boolean isDigit(char ch) @@ -66,6 +69,14 @@ public class Query */ public String getTransferedQuestion(String question) { + //discard ? ! . + if(question.endsWith("?") || question.endsWith("。") || question.endsWith("!")) + question = question.substring(0, question.length()-1); + + //discard 《》 because stanford parser DO NOT recognize them. TODO: why? + question = question.replace("《", "").replace("》", ""); + question = question.replace("“", "").replace("”", ""); // now just discard "" because they confuse the parser. + //rule1: discard ".", because "." and "_" will be disconnected by parser. Discard word tail's "'", which may pollutes NER question = question.replace("' ", " "); String [] words = question.split(" "); @@ -84,45 +95,31 @@ public class Query ret = ret.substring(0,ret.length()-1); ret = ret.replace("-", " "); - ret = ret.replace("in america", "in United States"); - - //rule2: as well as -> and - ret = ret.replace("as well as", "and"); - - //rule3: movie -> film - ret = ret.replace(" movie", " film"); - ret = ret.replace(" movies", " films"); + return ret; } - /** - * Recognize entity & type & literal in KB and replace " " in Phrases with "_" - * @param question - * @return merged question list - */ - public ArrayList getMergedQuestionList(String question) - { - ArrayList mergedQuestionList = null; - //entity & type recognize - EntityRecognition er = new EntityRecognition(); - mergedQuestionList = er.process(question); - preLog = er.preLog; - mWordList = er.mWordList; - - return mergedQuestionList; - } - public String removeQueryId(String question) { String ret = question; + // case 1: 1\t int st = question.indexOf("\t"); - if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9') + if(st!=-1 && question.length()>4 && isDigit(question.charAt(0))) { queryId = question.substring(0,st); ret = question.substring(st+1); System.out.println("Extract QueryId :"+queryId); } + // case 2: q1: | 1: + st = question.indexOf(":"); + if(st!=-1 && st<6 && question.length()>4 && (isDigit(question.charAt(0)) ||question.startsWith("q"))) + { + queryId = question.substring(0,st).replace("q", ""); + ret = question.substring(st+1); + System.out.println("Extract QueryId :"+queryId); + } + return ret; } } diff --git a/src/qa/extract/EntityRecognition.java b/src/qa/extract/EntityRecognition.java deleted file mode 100644 index bad4ac9..0000000 --- a/src/qa/extract/EntityRecognition.java +++ /dev/null @@ -1,864 +0,0 @@ -package qa.extract; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; - -import fgmt.EntityFragment; -import nlp.ds.Word; -import qa.Globals; -import rdf.EntityMapping; -import rdf.NodeSelectedWithScore; -import rdf.TypeMapping; 
-import rdf.MergedWord; -import utils.FileUtil; -import addition.*; - -/** - * Core class of Node Recognition - * @author husen - */ -public class EntityRecognition { - public String preLog = ""; - public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt"; - - double EntAcceptedScore = 26; - double TypeAcceptedScore = 0.5; - double AcceptedDiffScore = 1; - - public ArrayList mWordList = null; - public ArrayList stopEntList = null; - public ArrayList badTagListForEntAndType = null; - ArrayList> selectedList = null; - - TypeRecognition tr = null; - AddtionalFix af = null; - - public EntityRecognition() - { - // LOG - preLog = ""; - loadStopEntityDict(); - - // Bad posTag for entity - badTagListForEntAndType = new ArrayList(); - badTagListForEntAndType.add("RBS"); - badTagListForEntAndType.add("JJS"); - badTagListForEntAndType.add("W"); - badTagListForEntAndType.add("."); - badTagListForEntAndType.add("VBD"); - badTagListForEntAndType.add("VBN"); - badTagListForEntAndType.add("VBZ"); - badTagListForEntAndType.add("VBP"); - badTagListForEntAndType.add("POS"); - - // Additional fix for CATEGORY (in DBpedia) - af = new AddtionalFix(); - tr = new TypeRecognition(); - - System.out.println("EntityRecognizer Initial : ok!"); - } - - public void loadStopEntityDict() - { - stopEntList = new ArrayList(); - try - { - List inputs = FileUtil.readFile(stopEntFilePath); - for(String line: inputs) - { - if(line.startsWith("#")) - continue; - stopEntList.add(line); - } - } - catch (Exception e) { - e.printStackTrace(); - } - } - - public ArrayList process(String question) - { - ArrayList fixedQuestionList = new ArrayList(); - ArrayList literalList = new ArrayList(); - HashMap entityScores = new HashMap(); - HashMap entityMappings = new HashMap(); - HashMap typeScores = new HashMap(); - HashMap typeMappings = new HashMap(); - HashMap mappingScores = new HashMap(); - ArrayList mustSelectedList = new ArrayList(); - - System.out.println("--------- entity/type recognition start ---------"); - - Word[] words = Globals.coreNLP.getTaggedWords(question); - mWordList = new ArrayList(); - - long t1 = System.currentTimeMillis(); - int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0; - boolean needRemoveCommas = false; - - // Check entity & type - // Notice, ascending order by length - StringBuilder tmpOW = new StringBuilder(); - StringBuilder tmpBW = new StringBuilder(); - for(int len=1; len<=words.length; len++) - { - for(int st=0,ed=st+len; ed<=words.length; st++,ed++) - { - String originalWord = "", baseWord = "", allUpperWord = ""; - //String[] posTagArr = new String[len]; - for(int j=st; j0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z') - { - String pre = tmp.substring(0,1).toUpperCase(); - tmp = pre + tmp.substring(1); - } - allUpperWord += tmp; - - if(j < ed-1) - { - //originalWord += "_"; - //baseWord += "_"; - tmpOW.append("_"); - tmpBW.append("_"); - } - } - originalWord = tmpOW.toString(); - baseWord=tmpBW.toString(); - tmpOW.setLength(0); - tmpBW.setLength(0); - - allCnt++; -/* - * Filters to speed up and drop some bad cases. -*/ - boolean entOmit = false, typeOmit = false; - int prep_cnt=0; - - // Upper words can pass filter. 
eg: "Melbourne , Florida" - int UpperWordCnt = 0; - for(int i=st;i='A' && words[i].originalForm.charAt(0)<='Z') - || ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i0) - { - Word formerWord = words[st-1]; - //as princess - if(formerWord.baseForm.equals("as")) - entOmit = true; - //how many dogs? - if(formerWord.baseForm.equals("many")) - entOmit = true; - - //obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series - if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP"))) - entOmit = true; - //the father of you - if(ed='A' && nextWord.originalForm.charAt(0)<='Z') - entOmit = true; - } - - for(int i=st;i= 3) - { - entOmit = true; - typeOmit = true; - } - } -/* - * Filter done. -*/ - - // Search category | highest priority - String category = null; - if(af.pattern2category.containsKey(baseWord)) - { - typeOmit = true; - entOmit = true; - category = af.pattern2category.get(baseWord); - } - - // Search type - int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend() - ArrayList tmList = new ArrayList(); - if(!typeOmit) - { - System.out.println("Type Check: "+originalWord); - //checkTypeCnt++; - //search standard type - tmList = tr.getTypeIDsAndNamesByStr(baseWord); - if(tmList == null || tmList.size() == 0) - { - tmList = tr.getTypeIDsAndNamesByStr(originalWord); - if(tmList != null && tmList.size()>0) - hitMethod = 2; - } - else - hitMethod = 1; - - //Search extend type (YAGO type) - if(tmList == null || tmList.size() == 0) - { - tmList = tr.getExtendTypeByStr(allUpperWord); - if(tmList != null && tmList.size() > 0) - { - preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n"; - hitMethod = 3; - } - } - } - - // Search entity - ArrayList emList = new ArrayList(); - if(!entOmit && !stopEntList.contains(baseWord)) - { - System.out.println("Ent Check: "+originalWord); - checkEntCnt++; - // Notice, the second parameter is whether use DBpedia Lookup. - emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len); - if(emList == null || emList.size() == 0) - { - emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len); - } - if(emList!=null && emList.size()>10) - { - ArrayList tmpList = new ArrayList(); - for(int i=0;i<10;i++) - { - tmpList.add(emList.get(i)); - } - emList = tmpList; - } - } - - MergedWord mWord = new MergedWord(st,ed,originalWord); - - // Add category - if(category != null) - { - mWord.mayCategory = true; - mWord.category = category; - int key = st*(words.length+1) + ed; - mustSelectedList.add(key); - } - - // Add literal - if(len==1 && checkLiteralWord(words[st])) - { - mWord.mayLiteral = true; - int key = st*(words.length+1) + ed; - literalList.add(key); - } - - // Add type mappings - if(tmList!=null && tmList.size()>0) - { - // Drop by score threshold - if(tmList.get(0).score < TypeAcceptedScore) - typeOmit = true; - - // Only allow EXACT MATCH when method=1|2 - // TODO: consider approximate match and taxonomy. 
eg, actor->person - String likelyType = tmList.get(0).typeName.toLowerCase(); - String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase(); - if(!candidateBase.equals(likelyType) && hitMethod == 1) - typeOmit = true; - if(!candidateOriginal.equals(likelyType) && hitMethod == 2) - typeOmit = true; - - if(!typeOmit) - { - mWord.mayType = true; - mWord.tmList = tmList; - - int key = st*(words.length+1) + ed; - typeMappings.put(key, tmList.get(0).typeName); - typeScores.put(key, tmList.get(0).score); - } - } - - // Add entity mappings - if(emList!=null && emList.size()>0) - { - // Drop by score threshold - if(emList.get(0).score < EntAcceptedScore) - entOmit = true; - - // Drop: the [German Shepherd] dog - else if(len > 2) - { - for(int key: entityMappings.keySet()) - { - //int te=key%(words.length+1); - int ts=key/(words.length+1); - if(ts == st+1 && ts <= ed) - { - //DT in lowercase (allow uppercase, such as: [The Pillars of the Earth]) - if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z')) - { - entOmit = true; - } - } - } - } - - // Record info in merged word - if(!entOmit) - { - mWord.mayEnt = true; - mWord.emList = emList; - - // use to remove duplicate and select - int key = st*(words.length+1) + ed; - entityMappings.put(key, emList.get(0).entityID); - - // fix entity score | conflict resolution - double score = emList.get(0).score; - String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_"); - String lowerOriginalWord = originalWord.toLowerCase(); - // !Award: whole match - if(likelyEnt.equals(lowerOriginalWord)) - score *= len; - // !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy]) - //e.g, Social_Democratic_Party -> all ents -> drop the overlapped smaller ones - //e.g, Abraham_Lincoln -> select the whole word - if(len>1) - { - boolean[] flag = new boolean[words.length+1]; - ArrayList needlessEntList = new ArrayList(); - double tmpScore=0; - for(int preKey: entityMappings.keySet()) - { - if(preKey == key) - continue; - int te=preKey%(words.length+1),ts=preKey/(words.length+1); - for(int i=ts;i= te) - { - needlessEntList.add(preKey); - tmpScore += entityScores.get(preKey); - } - } - int hitCnt = 0; - for(int i=st;i 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4) - { - boolean commaTotalRight = true; - if(originalWord.contains(",")) - { - String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase(); - String likelyCompactEnt = likelyEnt.replace(",","").replace("_", ""); - if(!candidateCompactString.equals(likelyCompactEnt)) - commaTotalRight = false; - else - { - mWord.name = mWord.name.replace("_,_","_"); - needRemoveCommas = true; - } - } - if(commaTotalRight) - { - mustSelectedList.add(key); - if(tmpScore>score) - score = tmpScore+1; - for(int preKey: needlessEntList) - { - entityMappings.remove(preKey); - mustSelectedList.remove(Integer.valueOf(preKey)); - } - } - } - } - //NOTICE: score in mWord have no changes. we only change the score in entityScores. 
- entityScores.put(key,score); - } - } - - if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral) - mWordList.add(mWord); - } - } - - /* Print all candidates (use fixed score).*/ - System.out.println("------- Result ------"); - for(MergedWord mWord: mWordList) - { - int key = mWord.st * (words.length+1) + mWord.ed; - if(mWord.mayCategory) - { - System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0"); - preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n"; - } - if(mWord.mayEnt) - { - System.out.println("Detect entity mapping: "+mWord.name+": ["); - for(EntityMapping em: mWord.emList) - System.out.print(em.entityName + ", "); - System.out.println("]"); - preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n"; - hitEntCnt++; - } - if(mWord.mayType) - { - System.out.println("Detect type mapping: "+mWord.name+": ["); - for(TypeMapping tm: mWord.tmList) - System.out.print(tm.typeName + ", "); - System.out.println("]"); - preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n"; - hitTypeCnt++; - } - if(mWord.mayLiteral) - { - System.out.println("Detect literal: "+mWord.name); - preLog += "++++ Literal detect: "+mWord.name+"\n"; - } - } - - /* - * Sort by score and remove duplicate. - * eg, <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">. - * Notice, reserve all information in mWordList. - */ - // one ENT maps different mergedWord in query, reserve the higher score. - ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1); - List keys = new ArrayList(entityMappings.keySet()); - Collections.sort(keys, bvc); - for(Integer key : keys) - { - if(!mappingScores.containsKey(entityMappings.get(key))) - mappingScores.put(entityMappings.get(key), entityScores.get(key)); - else - entityMappings.remove(key); - } - - selectedList = new ArrayList>(); - ArrayList selected = new ArrayList(); - - // Some phrases must be selected. - selected.addAll(mustSelectedList); - for(Integer key: typeMappings.keySet()) - { - // !type(len>1) (Omit len=1 because: [Brooklyn Bridge] is a entity. 
- int ed = key%(words.length+1), st = key/(words.length+1); - if(st+1 < ed) - { - boolean beCovered = false; - //Entity cover type, eg:[prime_minister of Spain] - for(int preKey: entityMappings.keySet()) - { - int te=preKey%(words.length+1),ts=preKey/(words.length+1); - //Entiy should longer than type - if(ts <= st && te >= ed && ed-st < te-ts) - { - beCovered = true; - } - } - - if(!beCovered) - selected.add(key); - } - } - - // Conflict resolution - ArrayList noConflictSelected = new ArrayList(); - - //select longer one when conflict - boolean[] flag = new boolean[words.length]; - ByLenComparator blc = new ByLenComparator(words.length+1); - Collections.sort(selected,blc); - - for(Integer key : selected) - { - int ed = key%(words.length+1), st = (key-ed)/(words.length+1); - boolean omit = false; - for(int i=st;i top-k decision - dfs(keys,0,noConflictSelected,words.length+1); - ArrayList nodeSelectedWithScoreList = new ArrayList(); - for(ArrayList select: selectedList) - { - double score = 0; - for(Integer key: select) - { - if(entityScores.containsKey(key)) - score += entityScores.get(key); - if(typeScores.containsKey(key)) - score += typeScores.get(key); - } - NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score); - nodeSelectedWithScoreList.add(tmp); - } - Collections.sort(nodeSelectedWithScoreList); - - // Replace - int cnt = 0; - for(int k=0; k= nodeSelectedWithScoreList.size()) - break; - selected = nodeSelectedWithScoreList.get(k).selected; - - Collections.sort(selected); - int j = 0; - String res = question; - if(selected.size()>0) - { - res = words[0].originalForm; - int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1); - for(int i=1;ist && i= ed && j= 3) // top-3 - break; - } - long t2 = System.currentTimeMillis(); -// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n"; -// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n"; - preLog += "Node Recognition time: "+ (t2-t1) + "ms\n"; - System.out.println("Total check time: "+ (t2-t1) + "ms"); - System.out.println("--------- pre entity/type recognition end ---------"); - - return fixedQuestionList; - } - - public void dfs(List keys,int dep,ArrayList selected,int size) - { - if(dep == keys.size()) - { - ArrayList tmpList = (ArrayList) selected.clone(); - selectedList.add(tmpList); - } - else - { - //off: dep-th mWord - dfs(keys,dep+1,selected,size); - //on: no conflict - boolean conflict = false; - for(int preKey: selected) - { - int curKey = keys.get(dep); - int preEd = preKey%size, preSt = (preKey-preEd)/size; - int curEd = curKey%size, curSt = (curKey-curEd)/size; - if(!(preSt getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len) - { - String n = entity; - ArrayList ret= new ArrayList(); - - //1. Lucene index - ret.addAll(EntityFragment.getEntityMappingList(n)); - - //2. 
DBpedia Lookup (some cases) - if (useDblk) - { - ret.addAll(Globals.dblk.getEntityMappings(n, null)); - } - - Collections.sort(ret); - - if (ret.size() > 0) return ret; - else return null; - } - - public int preferDBpediaLookupOrLucene(String entityName) - { - int cntUpperCase = 0; - int cntSpace = 0; - int cntPoint = 0; - int length = entityName.length(); - for (int i=0; i='A' && c<='Z') - cntUpperCase++; - } - - if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3) - return 1; - if (cntUpperCase == length) - return 1; - return 0; - } - - static class ByValueComparator implements Comparator { - HashMap base_map; - int base_size; - double eps = 1e-8; - - int dblcmp(double a,double b) - { - if(a+eps < b) - return -1; - return b+eps base_map, Integer size) { - this.base_map = base_map; - this.base_size = size; - } - - public int compare(Integer arg0, Integer arg1) { - if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) { - return 0; - } - - if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) { - return 1; - } - else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0) - { - int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; - if (len0 < len1) { - return 1; - } else if (len0 == len1) { - return 0; - } else { - return -1; - } - } - else { - return -1; - } - } - } - - static class ByLenComparator implements Comparator { - int base_size; - - public ByLenComparator(int size) { - this.base_size = size; - } - - public int compare(Integer arg0, Integer arg1) { - int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; - if (len0 < len1) { - return 1; - } else if (len0 == len1) { - return 0; - } else { - return -1; - } - } - } - - public boolean isDigit(char ch) - { - if(ch>='0' && ch<='9') - return true; - return false; - } - - //TODO: other literal words. 
- public boolean checkLiteralWord(Word word) - { - boolean ok = false; - if(word.posTag.equals("CD")) - ok = true; - return ok; - } - - public static void main (String[] args) - { - Globals.init(); - EntityRecognition er = new EntityRecognition(); - try - { - BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); - while (true) - { - System.out.println("Please input the question: "); - String question = br.readLine(); - - er.process(question); - } - - } catch (IOException e) { - e.printStackTrace(); - } - } - -} diff --git a/src/qa/extract/EntityRecognitionCh.java b/src/qa/extract/EntityRecognitionCh.java new file mode 100644 index 0000000..6583a8e --- /dev/null +++ b/src/qa/extract/EntityRecognitionCh.java @@ -0,0 +1,566 @@ +package qa.extract; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.io.IOException; +import java.io.BufferedReader; +import java.io.InputStreamReader; + +import lcn.EntityFragmentFields; + +import com.huaban.analysis.jieba.JiebaSegmenter; +import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; +import com.huaban.analysis.jieba.SegToken; + +import edu.stanford.nlp.util.Pair; +import fgmt.TypeFragment; +import qa.Query; +import rdf.EntityMapping; +import rdf.TypeMapping; +import nlp.ds.*; +import utils.FileUtil; + +final class MODNUM +{ + public static int prime=9999991; +} +//TODO: replace by nlp.ds.word +class Word +{ + //type:0=normal word 1=entity 2=literal(string) + String word; + int type; + int pos=0; + List entList=null; + Word(String w) + { + word=w; + type=0; + } + Word(String w,int i) + { + word=w; + type=i; + } + Word(String w,int i, int j) + { + word=w; + type=i; + pos=j; + } + Word(String w,int i, int j,List l) + { + word=w; + type=i; + pos=j; + entList=l; + } +} + +class Ent +{ + public final int mod=MODNUM.prime; + public String entity_name,mention; + public int no; + public long hashe,hashm; + public Ent(String load) + { + int indexOf9=load.indexOf(9); + if (indexOf9>=0) + { + mention=load.substring(0, indexOf9); + String tmp=load.substring(indexOf9+1); + int t9=tmp.indexOf(9); + if (t9>=0) + { + entity_name=tmp.substring(0, t9); + String numberStr=tmp.substring(t9+1); + try + { + no=Integer.valueOf(numberStr); + }catch(Exception e){no=-1;}; + } + else entity_name=tmp; + hashe=calHash(entity_name); + } + else + { + mention=load; + hashe=-1; + } + hashm=calHash(mention); + } + public long calHash(String p) + { + long x=0; + if (p==null || p.length()==0) return 0; + for (int i=0;i> entMap,nentMap; + public static JiebaSegmenter segmenter = new JiebaSegmenter(); + + public final static int MaxEnt=20; + + static + { + long t0 = System.currentTimeMillis(); + List nent = FileUtil.readFile("data/pkubase/paraphrase/ccksminutf.txt"); + List mention2ent = FileUtil.readFile("data/pkubase/paraphrase/mini-mention2ent.txt"); + + entMap=new HashMap<>(); + nentMap=new HashMap<>(); + + System.out.println("Mention2Ent size: " + mention2ent.size()); + for (String input:mention2ent) + { + Ent q=new Ent(input); + if (entMap.containsKey(q.mention)) + entMap.get(q.mention).add(q.entity_name); + else + { + List l=new ArrayList<>(); + l.add(q.entity_name); + entMap.put(q.mention, l); + } + } + // mention: NOT ent word; entity_name: frequency + for (String input:nent) + { + Ent q=new Ent(input); + if (nentMap.containsKey(q.mention)) + nentMap.get(q.mention).add(q.entity_name); + else + { + List l=new ArrayList<>(); + l.add(q.entity_name); + 
nentMap.put(q.mention, l); + } + } + + long t1 = System.currentTimeMillis(); + System.out.println("Read Mention2Ent used "+(t1-t0)+"ms"); + } + + public static boolean isAllNumber(String q) + { + boolean ret=true; + for (int i=0;i57) return false; + } + return ret; + } + public static String longestFirst2(String Question) + { + String ret=""; + String input=Question.replace('{',' ').replace('}',' '); + + int len=input.length(); + int[][] ex=new int[len+3][]; + Ent[][] entx=new Ent[len+3][]; + for (int i=0;i rstlist=entMap.get(searchstr); + + if (rstlist!=null && rstlist.size()>0) + { + ++pos; + ex[l][pos]=j; + entx[l][pos]=new Ent(searchstr); + } + } + ex[l][0]=pos; + } + int covered[]=new int[len+3]; + for (int l=len;l>=1;l--) + { + for (int p=1;p<=ex[l][0];p++) + { + int flag=1; + for (int k=ex[l][p];k>=ex[l][p]-l+1;k--) if (covered[k]>0) flag=0; + if (flag==1) + { + //1:占用 2:词头 4:词尾 8:其他 + int FLAG=0; + List nlist=nentMap.get(entx[l][p].mention); + if (nlist!=null && nlist.size()>0) FLAG=8; + if (isAllNumber(entx[l][p].mention)) FLAG=8; + + covered[ex[l][p]]|=4; + covered[ex[l][p]-l+1]|=2; + for (int k=ex[l][p];k>=ex[l][p]-l+1;k--) + { + covered[k]|=1|FLAG; + } + } + } + } + + for (int i=0;i① + public static String intToCircle(int i) + { + if (0>i || i>20) return null; + String ret=""; + ret=ret+(char)(9311+i); + return ret; + } + //①->1 + public static int circleToInt(String i) + { + int ret=i.charAt(0)-9311; + if (0> processedString(String s) + { + List ret=new ArrayList<>(); + String sentence = ""; + int flag=0; + String word=""; + for (int i=0;i>(sentence,ret); + } + public static String reprocess(List d, List list) + { + String ret=""; + + int used[]=new int[list.size()+1]; + int isValid[]=new int[list.size()+1]; + for (int i=0;i=1;len--) + { + for (int i=0;i4) flag=0; + if (circleToInt(list.get(j).word)>=0) flag=0; + if (used[j]==1) flag=0; + } + if (flag==0) continue; + List rstlist=entMap.get(tmp); + List nlist=nentMap.get(tmp); + if (nlist!=null && nlist.size()>0) + { + for (int j=i;j0 && (nlist==null||nlist.size()==0)) + { + for (int j=i;j0) + { + isValid[i]=pos; + for (int j=i+1;j0) + { + ret=ret+intToCircle(isValid[i]); + } + } + return ret; + } + public static String removeQueryId2(String question) + { + String ret = question; + int st = question.indexOf(":"); + if(st!=-1 && st<6 && question.length()>4 && ((question.charAt(0)>='0' && question.charAt(0)<='9') ||question.charAt(0)=='q')) + { + ret = question.substring(st+1); + } + return ret; + } + public static String thirdprocess(String sentence,List d) + { + String temp="",rets2=""; + int insyh=0; + int count=0; + List lst=new ArrayList<>(); + String syh=""; + for (int i=0;i=3) + { + String newent=""; + for (int j=i-count;j=1) + { + String rp=""; + for (int j=0;j> parse(String input, JiebaSegmenter segmenter) + { +// input=removeQueryId2(input); // Remove query id before. 
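+        // Rough shape of the pipeline below (a reading of the code, not an authoritative spec):
+        //   1) longestFirst2: greedy longest-first match against the mention2ent dictionary, marking hits with '{' '}';
+        //   2) processedString: pulls the marked mentions out into a Word list and masks them in the sentence;
+        //   3) jieba segments the masked sentence; reprocess re-merges adjacent tokens that still form known mentions;
+        //   4) thirdprocess appears to treat quoted spans as literals; circled numbers (①②...) act as placeholders
+        //      that index into the extracted Word list and are mapped back to entity candidates after the final segmentation.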
+ String newinput=longestFirst2 (input); + + Pair> d=null,r=new Pair>(); + r.second=new ArrayList<>(); + try { + d=processedString(newinput); + } catch (Exception e) { + System.out.println(e); + } + if (d!=null) + { + //System.out.println(d.first); + + List q=segmenter.process(d.first, SegMode.SEARCH); + String secondstr=""; + for (SegToken t:q) + { + secondstr=secondstr+t.word+","; + } + //System.out.println("First process: "+secondstr); + + String finalstring=""; + String stickstr=reprocess(d.second,q); + String thirdstr=thirdprocess(stickstr,d.second); + + List q2=segmenter.process(thirdstr, SegMode.SEARCH); + for (SegToken t:q2) + { + finalstring=finalstring+t.word+","; + int p=circleToInt(""+t.word.charAt(0)); + if (p!=-1) + { + Word ds=d.second.get(p-1); + r.second.add(new Word(ds.word,ds.type,ds.pos,entMap.get(ds.word))); + } + else + { + r.second.add(new Word(t.word,0,-1)); + } + } + + System.out.println("Result: "+finalstring); + + r.first=thirdstr; + + return r; + } + else return null; + } + + public static List parseSentAndRecogEnt(String sent) + { + Pair> result = parse(sent, segmenter); + if(result == null) + return null; + + List words = new ArrayList(); + int position = 1; + for(Word ow: result.second) + { + // Note: jieba postag is deprecated, so we utilize stanford parser to get postag in later. + nlp.ds.Word word = new nlp.ds.Word(ow.word, ow.word, null, position++); + words.add(word); + if(ow.type == 1 && ow.entList != null) + { + // Now just consider TYPE there in a smiple way. + if(TypeFragment.typeShortName2IdList.containsKey(ow.word)) + { + word.mayType = true; + word.tmList.add(new TypeMapping(TypeFragment.typeShortName2IdList.get(ow.word).get(0), ow.word, 100.0)); + } + word.mayEnt = true; + word.emList = new ArrayList(); + double score = 100; + for(String ent: ow.entList) + { + if(EntityFragmentFields.entityName2Id.containsKey(ent)) + { + //TODO: consider more suitable entity score + int eid = EntityFragmentFields.entityName2Id.get(ent); +// String fstr = EntityFragmentFields.entityFragmentString.get(eid); +// System.out.println(eid+"\t"+fstr); + word.emList.add(new EntityMapping(eid, ent, score)); + score -= 10; + } + } + } + else if(ow.type == 2) + word.mayLiteral = true; + // TODO: consider TYPE + } + + return words; + } + + public static void main(String[] args) throws IOException { + + EntityFragmentFields.load(); + + List inputList = FileUtil.readFile("data/test/mini-ccks.txt"); + + for(String input: inputList) + { + if (input.length()<2 || input.charAt(0)!='q') continue; + System.out.println("----------------------------------------"); + System.out.println(input); + EntityRecognitionCh.parseSentAndRecogEnt(input); + } + + } + +} + diff --git a/src/qa/extract/ExtractImplicitRelation.java b/src/qa/extract/ExtractImplicitRelation.java index 80a4900..a8ab67d 100644 --- a/src/qa/extract/ExtractImplicitRelation.java +++ b/src/qa/extract/ExtractImplicitRelation.java @@ -19,7 +19,6 @@ import log.QueryLogger; import fgmt.EntityFragment; import fgmt.TypeFragment; import nlp.ds.Word; -import nlp.tool.CoreNLP; public class ExtractImplicitRelation { @@ -374,7 +373,7 @@ public class ExtractImplicitRelation { public static void main(String[] args) throws Exception { - Globals.coreNLP = new CoreNLP(); +// Globals.coreNLP = new CoreNLP(); Globals.pd = new ParaphraseDictionary(); try { diff --git a/src/qa/extract/ExtractRelation.java b/src/qa/extract/ExtractRelation.java index fda99ae..8a5e866 100644 --- a/src/qa/extract/ExtractRelation.java +++ 
b/src/qa/extract/ExtractRelation.java @@ -28,8 +28,6 @@ public class ExtractRelation { public ArrayList findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog) { DependencyTree T = qlog.s.dependencyTreeStanford; - if(qlog.isMaltParserUsed) - T = qlog.s.dependencyTreeMalt; DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position); ArrayList shortestPath = T.getShortestNodePathBetween(n1,n2); diff --git a/src/qa/extract/TypeRecognition.java b/src/qa/extract/TypeRecognition.java index 91af418..c36573b 100644 --- a/src/qa/extract/TypeRecognition.java +++ b/src/qa/extract/TypeRecognition.java @@ -90,15 +90,7 @@ public class TypeRecognition { if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase())) return null; - //search in YAGO type - if(TypeFragment.yagoTypeList.contains(allUpperFormWord)) - { - //YAGO prefix - String typeName = "yago:"+allUpperFormWord; - TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); - tmList.add(tm); - } - else if(extendTypeMap.containsKey(allUpperFormWord)) + if(extendTypeMap.containsKey(allUpperFormWord)) { String typeName = extendTypeMap.get(allUpperFormWord); TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); @@ -251,22 +243,22 @@ public class TypeRecognition { } } // type - else if(sr.arg1Word.mayType) + else if(sr.arg1Word.mayType) //TODO: type { //rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries - if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) - && !words[arg1WordPos-2].posTag.startsWith("V")) - { - sr.isArg1Constant = true; - double largerScore = 1000; - if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) - largerScore = sr.predicateMappings.get(0).score * 2; - PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); - sr.predicateMappings.add(0,nPredicate); - - //constant type should be object - sr.preferredSubj = sr.arg2Word; - } +// if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) +// && !words[arg1WordPos-2].posTag.startsWith("V")) +// { +// sr.isArg1Constant = true; +// double largerScore = 1000; +// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) +// largerScore = sr.predicateMappings.get(0).score * 2; +// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); +// sr.predicateMappings.add(0,nPredicate); +// +// //constant type should be object +// sr.preferredSubj = sr.arg2Word; +// } } //ent: constant else if(sr.arg1Word.mayEnt) @@ -297,37 +289,37 @@ public class TypeRecognition { else if(sr.arg2Word.mayType) { //rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries - if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) - && !words[arg2WordPos-2].posTag.startsWith("V") ) - { - sr.isArg2Constant = true; - double largerScore = 1000; - if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) - largerScore = sr.predicateMappings.get(0).score * 2; - PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); - sr.predicateMappings.add(0,nPredicate); - - sr.preferredSubj = 
sr.arg1Word; - } +// if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) +// && !words[arg2WordPos-2].posTag.startsWith("V") ) +// { +// sr.isArg2Constant = true; +// double largerScore = 1000; +// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) +// largerScore = sr.predicateMappings.get(0).score * 2; +// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); +// sr.predicateMappings.add(0,nPredicate); +// +// sr.preferredSubj = sr.arg1Word; +// } //rule: Be ... a type? - if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) - { - sr.isArg2Constant = true; - double largerScore = 1000; - if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) - largerScore = sr.predicateMappings.get(0).score * 2; - PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); - sr.predicateMappings.add(0,nPredicate); - - sr.preferredSubj = sr.arg1Word; - } +// if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) +// { +// sr.isArg2Constant = true; +// double largerScore = 1000; +// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) +// largerScore = sr.predicateMappings.get(0).score * 2; +// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); +// sr.predicateMappings.add(0,nPredicate); +// +// sr.preferredSubj = sr.arg1Word; +// } } else if(sr.arg2Word.mayEnt) { sr.isArg2Constant = true; } - if(sr.arg1Word != sr.preferredSubj) + if(sr.arg2Word == sr.preferredSubj) sr.swapArg1Arg2(); } } diff --git a/src/qa/mapping/DBpediaLookup.java b/src/qa/mapping/DBpediaLookup.java deleted file mode 100644 index bc5225f..0000000 --- a/src/qa/mapping/DBpediaLookup.java +++ /dev/null @@ -1,163 +0,0 @@ -package qa.mapping; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashMap; - -import lcn.EntityFragmentFields; -import log.QueryLogger; - -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.methods.GetMethod; - -import fgmt.EntityFragment; -import rdf.EntityMapping; - -public class DBpediaLookup { - //There are two websites of the DBpediaLookup online service. 
- //public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search="; - public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString="; - - public HttpClient ctripHttpClient = null; - - //public static final String begin = ""; - //public static final String begin = "\n "; - public static final String end = ""; - public static final int end_length = end.length(); - - public static HashMapentMentionDict = null; // TODO: base on redirect data & wikipedia click data to build mention2ent's dictionary, now just manually - - public DBpediaLookup() - { - ctripHttpClient = new HttpClient(); - ctripHttpClient.setTimeout(3000); - - entMentionDict = new HashMap(); - entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales"); - } - - public ArrayList getEntityMappings(String searchString, QueryLogger qlog) - { - ArrayList slist = new ArrayList(); - if(entMentionDict.containsKey(searchString)) - slist.add(entMentionDict.get(searchString)); - else - slist = lookForEntityNames(searchString, qlog); - - if (slist.size() == 0 && searchString.contains(". ")) - slist.addAll(lookForEntityNames(searchString.replaceAll(". ", "."), qlog)); - - ArrayList emlist = new ArrayList(); - - // Now string use "_" as delimiter (original) - String[] sa = searchString.split("_"); - int UpperCnt = 0; - for(String str: sa) - { - if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') ) - UpperCnt ++; - } - - System.out.print("DBpediaLookup find: " + slist + ", "); - - int count = 40; - for (String s : slist) - { - //consider ABBR only when all UPPER; drop when too long edit distance - if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2) - continue; - - int eid = -1; - s = s.replace(" ", "_"); - if(EntityFragmentFields.entityName2Id.containsKey(s)) - { - eid = EntityFragmentFields.entityName2Id.get(s); - emlist.add(new EntityMapping(eid, s, count)); - count -=2 ; - } - else - { - System.out.print("Drop "+s+" because it not in Entity Dictionary. 
"); - } - } - System.out.println("DBpediaLookup select: " + emlist); - - return emlist; - } - - public ArrayList lookForEntityNames (String searchString, QueryLogger qlog) { - // URL transition: " " -> %20 - GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20")); - ArrayList ret = new ArrayList(); - int statusCode; - - try { - statusCode = ctripHttpClient.executeMethod(getMethod); - } catch (HttpException e) { - e.printStackTrace(); - return ret; - } catch (IOException e) { - e.printStackTrace(); - return ret; - } - - if (statusCode!=200) return null; - - String response = getMethod.getResponseBodyAsString(); - if (qlog != null && qlog.MODE_debug) { - System.out.println("searchString=" + searchString); - System.out.println("statusCode=" + statusCode); - System.out.println("response=" + getMethod.getResponseBodyAsString()); - } - getMethod.releaseConnection(); - - //System.out.println(response); - - if (response == null || response.isEmpty()) - return ret; - int idx1 = response.indexOf(begin); - while (idx1 != -1) { - int idx2 = response.indexOf(end, idx1+begin_length); - String ss = response.substring(idx1+begin_length, idx2); - ret.add(ss); - //System.out.println(ss); - idx1 = response.indexOf(begin, idx2 + end_length); - } - - return ret; - } - - public static void main(String argv[]){ - - DBpediaLookup dbplook = new DBpediaLookup(); - - BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); - try { - while (true) { - System.out.println("Test DBpediaLookup."); - System.out.print("Please input the search string: "); - String searchString = br.readLine(); - try { - long t1 = System.currentTimeMillis(); - ArrayList res = dbplook.lookForEntityNames(searchString, null); - long t2 = System.currentTimeMillis(); - System.out.println(res); - System.out.println("time=" + (t2-t1) + "ms"); - } catch (Exception e) { - e.printStackTrace(); - } - } - } catch (IOException e) { - e.printStackTrace(); - } - - - return; - } -} diff --git a/src/qa/parsing/BuildQueryGraph.java b/src/qa/parsing/BuildQueryGraph.java index f4758f1..5d868fd 100644 --- a/src/qa/parsing/BuildQueryGraph.java +++ b/src/qa/parsing/BuildQueryGraph.java @@ -37,84 +37,19 @@ public class BuildQueryGraph public BuildQueryGraph() { - whList.add("what"); - whList.add("which"); - whList.add("who"); - whList.add("whom"); - whList.add("when"); - whList.add("how"); - whList.add("where"); + whList.add("什么"); + whList.add("什么时候"); + whList.add("哪些"); + whList.add("哪里"); + whList.add("谁"); // Bad words for NODE. (base form) // We will train a node recognition model to replace such heuristic rules further. - stopNodeList.add("list"); - stopNodeList.add("give"); - stopNodeList.add("show"); - stopNodeList.add("star"); - stopNodeList.add("theme"); - stopNodeList.add("world"); - stopNodeList.add("independence"); - stopNodeList.add("office"); - stopNodeList.add("year"); - stopNodeList.add("work"); - } - - public void fixStopWord(QueryLogger qlog, DependencyTree ds) - { - String qStr = qlog.s.plainText.toLowerCase(); - - //... [which] - for(int i=2;i process(QueryLogger qlog) { try @@ -135,15 +70,15 @@ public class BuildQueryGraph * 3)Coreference resolution. 
* */ //0) Fix stop words - fixStopWord(qlog, ds); +// fixStopWord(qlog, ds); //1) Detect Modifier/Modified //rely on sentence (rather than dependency tree) //with some ADJUSTMENT (eg, ent+noun(noType&&noEnt) -> noun.omitNode=TRUE) for(Word word: qlog.s.words) getTheModifiedWordBySentence(qlog.s, word); //Find continuous modifier - for(Word word: qlog.s.words) - getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier +// for(Word word: qlog.s.words) +// getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier for(Word word: qlog.s.words) if(word.modifiedWord == null) //Other words modify themselves. NOTICE: only can be called after detecting all modifier. word.modifiedWord = word; @@ -167,9 +102,9 @@ public class BuildQueryGraph qlog.target = target.word; // !target can NOT be entity. (except general question)| which [city] has most people? - if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.emList!=null) + // only when target.mayType=True or exist other entities. + if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.mayEnt && target.word.mayType) { - //Counter example:Give me all Seven_Wonders_of_the_Ancient_World | (in fact, it not ENT, but CATEGORY, ?x subject Seve...) target.word.mayEnt = false; target.word.emList.clear(); } @@ -241,6 +176,17 @@ public class BuildQueryGraph curSU.neighborUnitList.add(expandSU); } } + if(semanticUnitList.size() == 1 && target.word.mayEnt) + { + Word[] words = qlog.s.words; + SemanticUnit curSU = semanticUnitList.get(0); + SemanticUnit expandSU = new SemanticUnit(words[words.length-1], false); + semanticUnitList.add(expandSU); + curSU.neighborUnitList.add(expandSU); + expandSU.neighborUnitList.add(curSU); + target = ds.getNodeByIndex(words.length); + qlog.target = target.word; + } qlog.timeTable.put("BQG_structure", (int)(System.currentTimeMillis()-t)); //step2: Find relations (Notice, we regard that the coreference have been resolved now) @@ -251,7 +197,7 @@ public class BuildQueryGraph qlog.timeTable.put("BQG_relation", (int)(System.currentTimeMillis()-t)); //Prepare for item mapping - TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary +// TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary TypeRecognition.constantVariableRecognition(qlog.semanticRelations, qlog); // Constant or Variable, embedded triples //(just for display) @@ -361,7 +307,7 @@ public class BuildQueryGraph tmpRelations = new ArrayList(); //Copy relations (for 'and', 'as soon as'...) |eg, In which films did Julia_Roberts and Richard_Gere play? //TODO: judge by dependency tree | other way to supplement relations - if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("and")) + if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("和")) { for(SimpleRelation sr: simpleRelations) { @@ -566,6 +512,7 @@ public class BuildQueryGraph return false; } + // detect the target (question focus), also to detect some co-reference via rules. (TODO: test existing utils for co-reference resolution) public DependencyTreeNode detectTarget(DependencyTree ds, QueryLogger qlog) { visited.clear(); @@ -583,8 +530,10 @@ public class BuildQueryGraph // No Wh-Word: use the first node; NOTICE: consider MODIFIER rules. 
E.g, was us president Obama ..., target=obama (rather us) if(target == null) { - for(Word word: words) + //Chinese sentence: the question focus is usually in the tail. + for(int i=words.length-1; i>=0; i--) { + Word word = words[i]; Word modifiedWord = word.modifiedWord; if(modifiedWord != null && isNodeCandidate(modifiedWord)) { @@ -594,42 +543,25 @@ public class BuildQueryGraph } if(target == null) - target = ds.nodesList.get(0); - - /* Are [E|tree_frogs] a type of [E|amphibian] , type - */ - for(DependencyTreeNode dtn: target.childrenList) - { - if(dtn.word.baseForm.equals("type")) - { - dtn.word.represent = target.word; - } - } - + target = ds.nodesList.get(0); } - //where, NOTICE: wh target from NN may not pass the function isNode() - if(target.word.baseForm.equals("where")) + //where + if(target.word.baseForm.equals("哪里")) { int curPos = target.word.position - 1; - //!Where is the residence of - if(words[curPos+1].baseForm.equals("be") && words[curPos+2].posTag.equals("DT")) + //大兴安岭的[终点]是(哪里) + if(curPos-2>=0 && isNodeCandidate(words[curPos-2]) && words[curPos-1].baseForm.equals("是")) { - for(int i=curPos+4;i had the highest budget boolean ok = false; @@ -683,14 +598,14 @@ public class BuildQueryGraph } //what - else if(target.word.baseForm.equals("what")) + else if(target.word.baseForm.equals("什么")) { - //Detect:what is [the] sth1 prep. sth2? + //Detect:龙卷风的[英文名]是(什么) | 金轮国师的(什么)[武功]有十龙十象之力? //Omit: what is sth? if(target.father != null && ds.nodesList.size()>=5) { DependencyTreeNode tmp1 = target.father; - if(tmp1.word.baseForm.equals("be")) + if(tmp1.word.baseForm.equals("是")) { for(DependencyTreeNode child: tmp1.childrenList) { @@ -698,15 +613,13 @@ public class BuildQueryGraph continue; if(isNode(child)) { - //sth1 - boolean hasPrep = false; + boolean another_node = false; for(DependencyTreeNode grandson: child.childrenList) - { //prep - if(grandson.dep_father2child.equals("prep")) - hasPrep = true; - } - //Detect modifier: what is the sht1's [sth2]? | what is the largest [city]? - if(hasPrep || qlog.s.hasModifier(child.word)) + if(isNode(grandson)) + another_node = true; + + //more than 2 nodes || Detect modifier: what is the sht1's [sth2]? | what is the largest [city]? + if(another_node || qlog.s.hasModifier(child.word)) { target.word.represent = child.word; target = child; @@ -715,82 +628,84 @@ public class BuildQueryGraph } } } - //what sth || What airlines are (part) of the SkyTeam alliance? + //what sth: 什么山高于8000米 else if(isNode(tmp1)) { target.word.represent = tmp1.word; - target = tmp1; - // Coreference resolution - int curPos = target.word.position - 1; - if(curPos+3 6) - { - words[curPos+2].represent = target.word; - } - + target = tmp1; } } // by sentence - if(target.word.baseForm.equals("what")) + if(target.word.baseForm.equals("什么")) { + // 金轮国师的(什么)[武功]有十龙十象之力? int curPos = target.word.position - 1; - // what be the [node] ... ? (Notice: words.length CONTAINS symbol(?),different from nodeList) - if(words.length > 5 && words[curPos+1].baseForm.equals("be") && words[curPos+2].baseForm.equals("the") && isNodeCandidate(words[curPos+3])) + if(curPos + 1 <= words.length - 1 && isNodeCandidate(words[curPos+1])) { - target.word.represent = words[curPos+3]; - target = ds.getNodeByIndex(words[curPos+3].position); + target.word.represent = words[curPos+1]; + target = ds.getNodeByIndex(words[curPos+1].position); } } } //who - else if(target.word.baseForm.equals("who")) + else if(target.word.baseForm.equals("谁")) { - //Detect:who is/does [the] sth1 prep. sth2? 
|| Who was the pope that founded the Vatican_Television ? | Who does the voice of Bart Simpson? + //Detect:武汉大学的现任[校长]是(谁)? 和子女一起演过电影电视剧的[演员]有(谁)? //Others: who is sth? who do sth? | target = who - //test case: Who is the daughter of Robert_Kennedy married to? - if(ds.nodesList.size()>=5) - { //who - for(DependencyTreeNode tmp1: ds.nodesList) - { - if(tmp1 != target.father && !target.childrenList.contains(tmp1)) - continue; - if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do")) - { //is - for(DependencyTreeNode child: tmp1.childrenList) - { - if(child == target) - continue; - if(isNode(child)) - { //sth1 - boolean hasPrep = false; - for(DependencyTreeNode grandson: child.childrenList) - { //prep - if(grandson.dep_father2child.equals("prep")) - hasPrep = true; - } - //Detect modifier: who is the sht1's sth2? -// if(hasPrep || qlog.s.plainText.contains(child.word.originalForm + " 's")) // replaced by detect modifier directly - if(hasPrep || qlog.s.hasModifier(child.word)) - { - target.word.represent = child.word; - target = child; - break; - } - } - } - } - } - } + //test case: 湖上草是[谁]的(诗)? +// if(ds.nodesList.size()>=5) +// { //who +// for(DependencyTreeNode tmp1: ds.nodesList) +// { +// if(tmp1 != target.father && !target.childrenList.contains(tmp1)) +// continue; +// if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do")) +// { //is +// for(DependencyTreeNode child: tmp1.childrenList) +// { +// if(child == target) +// continue; +// if(isNode(child)) +// { //sth1 +// boolean hasPrep = false; +// for(DependencyTreeNode grandson: child.childrenList) +// { //prep +// if(grandson.dep_father2child.equals("prep")) +// hasPrep = true; +// } +// //Detect modifier: who is the sht1's sth2?if(hasPrep || qlog.s.hasModifier(child.word)) +// { +// target.word.represent = child.word; +// target = child; +// break; +// } +// } +// } +// } +// } +// } // by sentence - if(target.word.baseForm.equals("who")) + if(target.word.baseForm.equals("谁")) { int curPos = target.word.position - 1; - // who is usually coreference when it not the first word. - if(curPos - 1 >= 0 && isNodeCandidate(words[curPos-1])) + // [Node]是(谁) + if(curPos - 2 >= 0 && isNodeCandidate(words[curPos-2])) { - target.word.represent = words[curPos-1]; - target = ds.getNodeByIndex(words[curPos-1].position); + // 谁 在末尾: 武汉大学的现任[校长]是(谁) + if(curPos == words.length - 1 && (words[curPos-1].baseForm.equals("是") || words[curPos-1].baseForm.equals("有")) ) + { + target.word.represent = words[curPos-2]; + target = ds.getNodeByIndex(words[curPos-2].position); + } + // [湖上草]是谁的(诗) + if(curPos + 2 == words.length-1 && words[curPos-1].baseForm.equals("是") + && words[curPos+1].baseForm.equals("的") && isNodeCandidate(words[curPos+2])) + { + words[curPos+2].represent = words[curPos-2]; + } } + // Do nothing: [谁]的[女儿]嫁给了王思聪 } } //how @@ -847,7 +762,7 @@ public class BuildQueryGraph /* * There are two cases of [ent]+[type]:1、Chinese company 2、De_Beer company; * For 1, chinese -> company,for 2, De_Beer <- company - * Return: True : ent -> type | False : type <- ent + * Return: True : ent -> type | False : ent <- type * */ public boolean checkModifyBetweenEntType(Word entWord, Word typeWord) { @@ -868,9 +783,9 @@ public class BuildQueryGraph * Trough sentence rather than dependency tree as the latter often incorrect * Generally a sequencial nodes always modify the last node, an exception is test case 3. So we apply recursive search method. 
* test case: - * 1) the highest Chinese mountain - * 2) the Chinese popular director - * 3) the De_Beers company (company[type]-> De_Beers[ent]) + * 1) 最高的中国山峰 + * 2) 中国流行歌手 + * 3) 谷歌公司 (company[type]-> De_Beers[ent]) * */ public Word getTheModifiedWordBySentence(Sentence s, Word curWord) { @@ -898,14 +813,14 @@ public class BuildQueryGraph return curWord.modifiedWord = curWord; } - //modify LEFT: ent + type(cur) : De_Beer company + //modify LEFT: ent + type(cur) : 谷歌 公司 if(preWord != null && curWord.mayType && preWord.mayEnt) //ent + type(cur) { if(!checkModifyBetweenEntType(preWord, curWord)) //De_Beer <- company, 注意此时即使type后面还连着node,也不理会了 return curWord.modifiedWord = preWord; } - //modify itself: ent(cur) + type : De_Beer company + //modify itself: ent(cur) + type : 谷歌 公司 if(nextModifiedWord != null && curWord.mayEnt && nextModifiedWord.mayType) { if(!checkModifyBetweenEntType(curWord, nextModifiedWord)) diff --git a/src/qa/parsing/QuestionParsing.java b/src/qa/parsing/QuestionParsing.java index d9b86b1..36d69a2 100644 --- a/src/qa/parsing/QuestionParsing.java +++ b/src/qa/parsing/QuestionParsing.java @@ -16,36 +16,20 @@ public class QuestionParsing { } public void getDependenciesAndNER (QueryLogger qlog) { - long t1 = System.currentTimeMillis(); try { + long t1 = System.currentTimeMillis(); + qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser); - }catch(Exception e){ - e.printStackTrace(); - } - - long t2 = System.currentTimeMillis(); - try{ - qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser); - }catch(Exception e){ - //if errors occur, abandon malt tree - qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford; - System.err.println("MALT parser error! Use stanford parser instead."); - } - - try { - long t3 = System.currentTimeMillis(); - Globals.nerRecognizer.recognize(qlog.s); - long t4 = System.currentTimeMillis(); + + long t2 = System.currentTimeMillis(); +// Globals.nerRecognizer.recognize(qlog.s); //TODO: check NER + System.out.println("====StanfordDependencies("+(t2-t1)+"ms)===="); System.out.println(qlog.s.dependencyTreeStanford); - System.out.println("====MaltDependencies("+(t3-t2)+"ms)===="); - System.out.println(qlog.s.dependencyTreeMalt); - System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)===="); - qlog.s.printNERResult(); +// qlog.s.printNERResult(); qlog.timeTable.put("StanfordParser", (int)(t2-t1)); - qlog.timeTable.put("MaltParser", (int)(t3-t2)); - qlog.timeTable.put("NER", (int)(t4-t3)); + } catch (Exception e) { e.printStackTrace(); } @@ -53,8 +37,7 @@ public class QuestionParsing { public void recognizeSentenceType(QueryLogger qlog) { - boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)|| - recognizeImperativeSentence(qlog.s.dependencyTreeMalt); + boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford); if (IsImperativeSentence) { qlog.s.sentenceType = SentenceType.ImperativeSentence; @@ -66,16 +49,14 @@ public class QuestionParsing { return; } - boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)|| - recognizeSpecialQuestion(qlog.s.dependencyTreeMalt); + boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford); if (IsSpecialQuestion) { qlog.s.sentenceType = SentenceType.SpecialQuestion; return; } - boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)|| - recognizeGeneralQuestion(qlog.s.dependencyTreeMalt); + boolean IsGeneralQuestion = 
recognizeGeneralQuestion(qlog.s.dependencyTreeStanford); if (IsGeneralQuestion) { qlog.s.sentenceType = SentenceType.GeneralQuestion; diff --git a/src/rdf/MergedWord.java b/src/rdf/MergedWord.java deleted file mode 100644 index e011088..0000000 --- a/src/rdf/MergedWord.java +++ /dev/null @@ -1,41 +0,0 @@ -package rdf; - -import java.util.ArrayList; - -import rdf.EntityMapping; -import rdf.TypeMapping; - -public class MergedWord implements Comparable -{ - //original position - public int st,ed; - //position after merge (unselected is -1) - public int mergedPos = -1; - public String name; - public boolean mayCategory = false; - public boolean mayLiteral = false; - public boolean mayEnt = false; - public boolean mayType = false; - public ArrayList emList = null; - public ArrayList tmList = null; - public String category = null; - - public MergedWord(int s,int e,String n) - { - st = s; - ed = e; - name = n; - } - - @Override - //long to short - public int compareTo(MergedWord o) - { - int lenDiff = (this.ed-this.st) - (o.ed-o.st); - - if (lenDiff > 0) return -1; - else if (lenDiff < 0) return 1; - return 0; - } - -} diff --git a/src/rdf/SimpleRelation.java b/src/rdf/SimpleRelation.java index 98a79dd..b1ab3ff 100644 --- a/src/rdf/SimpleRelation.java +++ b/src/rdf/SimpleRelation.java @@ -65,7 +65,7 @@ public class SimpleRelation { } sumSelectivity = matchingScore*sumSelectivity*pidsup.support; int pid = pidsup.predicateID; - if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; +// if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; if (!pasList.containsKey(pid)) pasList.put(pid, sumSelectivity);
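+			// NOTE: the dbo-namespace bonus above is disabled; pkubase predicates are not DBpedia "dbo:" predicates (assumption based on the dataset switch), so every predicate keeps its raw selectivity score here.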