This is version 0.5. Although it now works for Chinese QA over pkubase, a lot of work remains to improve it further.
@@ -20,16 +20,10 @@ public class AddtionalFix | |||
public AddtionalFix() | |||
{ | |||
// Some category mappings for DBpedia, try automatic linking methods later. | base form | |||
pattern2category.put("gangster_from_the_prohibition_era", "Prohibition-era_gangsters"); | |||
pattern2category.put("seven_wonder_of_the_ancient_world", "Seven_Wonders_of_the_Ancient_World"); | |||
pattern2category.put("three_ship_use_by_columbus", "Christopher_Columbus"); | |||
pattern2category.put("13_british_colony", "Thirteen_Colonies"); | |||
} | |||
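The constructor above hard-codes a handful of pattern-to-category mappings, and its comment defers automatic linking to later work. As an interim step, the map could be loaded from a data file with the utils.FileUtil helper already used elsewhere in this commit; a minimal sketch, assuming a hypothetical tab-separated file named category_mappings.txt under the pkubase data directory:

// Hedged sketch: externalize the pattern -> category map (the file name below is hypothetical).
private void loadCategoryMappings() {
    String filename = Globals.localPath + "data/pkubase/category_mappings.txt"; // hypothetical path
    for (String line : FileUtil.readFile(filename)) {
        if (line.isEmpty() || line.startsWith("#"))
            continue;
        String[] cols = line.split("\t");
        if (cols.length == 2)
            pattern2category.put(cols[0], cols[1]);
    }
}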
public void process(QueryLogger qlog) | |||
{ | |||
fixCategory(qlog); | |||
oneTriple(qlog); | |||
oneNode(qlog); | |||
@@ -48,45 +42,10 @@ public class AddtionalFix | |||
spq.queryType = QueryType.Ask; | |||
} | |||
public void fixCategory(QueryLogger qlog) | |||
{ | |||
if(qlog == null || qlog.semanticUnitList == null) | |||
return; | |||
String var = null, category = null; | |||
for(SemanticUnit su: qlog.semanticUnitList) | |||
{ | |||
if(su.centerWord.mayCategory) | |||
{ | |||
var = "?"+su.centerWord.originalForm; | |||
category = su.centerWord.category; | |||
} | |||
} | |||
if(category != null && var != null) | |||
for(Sparql spq: qlog.rankedSparqls) | |||
{ | |||
boolean occured = false; | |||
for(Triple tri: spq.tripleList) | |||
{ | |||
if(tri.subject.equals(var)) | |||
{ | |||
occured = true; | |||
break; | |||
} | |||
} | |||
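// NOTE: 'occured' is computed above but never consulted in the lines shown here before the
// category triple is added below; presumably it was meant to guard the addTriple call.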
String oName = category; | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, var, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
spq.addTriple(triple); | |||
} | |||
} | |||
/* recognize one-Node query | |||
* Two cases:1、Special question|Imperative sentence 2、General question | |||
* 1-1:how many [], highest [] ... | For single variable, add constraint (aggregation) | |||
* 1-2: What is backgammon? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...) | |||
* 1-2: 谁是狄仁杰? | What is a bipolar syndrome? | Search an entity (return itself or its type/description ...) | |||
* 1-3: Give me all Seven Wonders of the Ancient World. | Notice, "Seven Wonders of the Ancient World" should be recognized as ENT before. (in fact it is CATEGORY in DBpedia) | |||
* 2-1: Are there any [castles_in_the_United_States](yago:type) | |||
* 2-2:Was Sigmund Freud married? | Lack of variable node. | |||
@@ -101,7 +60,7 @@ public class AddtionalFix | |||
Word[] words = qlog.s.words; | |||
if(qlog.s.sentenceType != SentenceType.GeneralQuestion) | |||
{ | |||
//1-1: how many [type] are there | List all [type] | |||
//1-1: 有多少[type] | 列出所有[type] | |||
if(target.mayType && target.tmList != null) | |||
{ | |||
String subName = "?"+target.originalForm; | |||
@@ -111,10 +70,10 @@ public class AddtionalFix | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
//1-2: What is [ent]? | |||
else if(target.mayEnt && target.emList != null) | |||
{ | |||
if(words.length >= 3 && words[0].baseForm.equals("what") && words[1].baseForm.equals("be")) | |||
//1-2: 什么是[ent] | |||
if(words.length >= 3 && (words[0].baseForm.equals("什么") || words[0].baseForm.equals("谁")) && words[1].baseForm.equals("是")) | |||
{ | |||
int eid = target.emList.get(0).entityID; | |||
String subName = target.emList.get(0).entityName; | |||
@@ -123,24 +82,14 @@ public class AddtionalFix | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
//1-3: Give me all Seven Wonders of the Ancient World. | |||
else if(target.mayCategory && target.category != null) | |||
{ | |||
String oName = target.category; | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, "?"+target.originalForm, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
//1-3: [ent] with other relations | |||
} | |||
} | |||
else | |||
else | |||
{ | |||
if(target.mayEnt && target.emList != null) | |||
{ | |||
//2-2:Was Sigmund Freud married? | |||
//2-2:[ent]结婚了吗? | |||
String relMention = ""; | |||
for(Word word: words) | |||
if(word != target && !word.baseForm.equals(".") && !word.baseForm.equals("?")) | |||
@@ -162,34 +111,6 @@ public class AddtionalFix | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
//2-3:Are penguins endangered? | |||
else | |||
{ | |||
if(target.position < words.length && pattern2category.containsKey(words[target.position].baseForm)) | |||
{ | |||
String oName = pattern2category.get(words[target.position].baseForm); | |||
String pName = "subject"; | |||
int pid = Globals.pd.predicate_2_id.get(pName); | |||
int eid = target.emList.get(0).entityID; | |||
String subName = target.emList.get(0).entityName; | |||
Triple triple = new Triple(eid, subName, pid, Triple.CAT_ROLE_ID, oName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
} | |||
//2-1: Are there any [castles_in_the_United_States](yago:type) | |||
else if(target.mayType && target.tmList != null) | |||
{ | |||
String typeName = target.tmList.get(0).typeName; | |||
String subName = "?" + target.originalForm; | |||
//System.out.println("typeName="+typeName+" subName="+subName); | |||
Triple triple = new Triple(Triple.VAR_ROLE_ID, subName, Globals.pd.typePredicateID, Triple.TYPE_ROLE_ID, typeName, null, 100); | |||
Sparql sparql = new Sparql(); | |||
sparql.addTriple(triple); | |||
qlog.rankedSparqls.add(sparql); | |||
} | |||
} | |||
} | |||
@@ -46,7 +46,9 @@ public class RelationFragment extends Fragment | |||
public static void load() throws Exception | |||
{ | |||
String filename = Globals.localPath + "data/DBpedia2016/fragments/predicate_RDF_fragment/predicate_fragment.txt"; | |||
System.out.println("Loading relation IDs and Fragments ..."); | |||
String filename = Globals.localPath + "data/pkubase/fragments/pkubase_predicate_fragment.txt"; | |||
List<String> inputs = FileUtil.readFile(filename); | |||
relFragments = new HashMap<Integer, ArrayList<RelationFragment>>(); | |||
literalRelationSet = new HashSet<Integer>(); | |||
@@ -72,7 +74,7 @@ public class RelationFragment extends Fragment | |||
public static void loadId() throws IOException | |||
{ | |||
String filename = Globals.localPath + "data/DBpedia2016/fragments/id_mappings/16predicate_id.txt"; | |||
String filename = Globals.localPath + "data/pkubase/fragments/id_mappings/pkubase_predicate_id.txt"; | |||
List<String> inputs = FileUtil.readFile(filename); | |||
relationShortName2IdList = new HashMap<String, ArrayList<Integer>>(); | |||
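The rest of loadId is not shown in this hunk. Under the assumption that pkubase_predicate_id.txt uses the same "<predicate>\t<id>" line format that ParaphraseDictionary.loadPredicateId parses later in this diff, the body would be a loop along these lines (a sketch, not the repository's code):

for (String line : inputs) {
    String[] cols = line.split("\t");
    if (cols.length < 2)
        continue;
    String shortName = cols[0];
    if (shortName.startsWith("<") && shortName.endsWith(">"))
        shortName = shortName.substring(1, shortName.length() - 1); // strip N-Triples angle brackets
    int id = Integer.parseInt(cols[1]);
    if (!relationShortName2IdList.containsKey(shortName))
        relationShortName2IdList.put(shortName, new ArrayList<Integer>());
    relationShortName2IdList.get(shortName).add(id);
}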
@@ -19,8 +19,6 @@ public class TypeFragment extends Fragment { | |||
public static HashMap<Integer, String> typeId2ShortName = null; | |||
public static final int NO_RELATION = -24232; | |||
public static HashSet<String> yagoTypeList = null; | |||
public HashSet<Integer> inEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> outEdges = new HashSet<Integer>(); | |||
public HashSet<Integer> entSet = new HashSet<Integer>(); | |||
@@ -33,26 +31,6 @@ public class TypeFragment extends Fragment { | |||
* 4, others: peace、vice | |||
*/ | |||
public static ArrayList<String> stopYagoTypeList = null; | |||
static void loadStopYagoTypeList() | |||
{ | |||
stopYagoTypeList = new ArrayList<String>(); | |||
stopYagoTypeList.add("Amazon"); | |||
stopYagoTypeList.add("Earth"); | |||
stopYagoTypeList.add("TheHungerGames"); | |||
stopYagoTypeList.add("SparklingWine"); | |||
stopYagoTypeList.add("Type"); | |||
stopYagoTypeList.add("Flow"); | |||
stopYagoTypeList.add("Owner"); | |||
stopYagoTypeList.add("Series"); | |||
stopYagoTypeList.add("Shot"); | |||
stopYagoTypeList.add("Part"); | |||
stopYagoTypeList.add("Care"); | |||
stopYagoTypeList.add("Peace"); | |||
stopYagoTypeList.add("Vice"); | |||
stopYagoTypeList.add("Dodo"); | |||
stopYagoTypeList.add("CzechFilms"); | |||
stopYagoTypeList.add("ChineseFilms"); | |||
} | |||
public TypeFragment(String fgmt, int fid) | |||
{ | |||
@@ -100,7 +78,7 @@ public class TypeFragment extends Fragment { | |||
public static void load() throws Exception | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/class_RDF_fragment/16type_fragment.txt"; | |||
String filename = Globals.localPath+"data/pkubase/fragments/pkubase_type_fragment.txt"; | |||
File file = new File(filename); | |||
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); | |||
@@ -128,14 +106,13 @@ public class TypeFragment extends Fragment { | |||
// can fix some data there | |||
// load Type Id | |||
loadId(); | |||
System.out.println("Load "+typeId2ShortName.size()+" basic types and "+yagoTypeList.size()+" yago types."); | |||
System.out.println("Load "+typeId2ShortName.size()+" basic types."); | |||
} | |||
public static void loadId() throws IOException | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16basic_types_id.txt"; | |||
String yagoFileName = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16yago_types_list.txt"; | |||
String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_type_id.txt"; | |||
File file = new File(filename); | |||
InputStreamReader in = new InputStreamReader(new FileInputStream(file),"utf-8"); | |||
BufferedReader br = new BufferedReader(in); | |||
@@ -161,19 +138,5 @@ public class TypeFragment extends Fragment { | |||
typeId2ShortName.put(RelationFragment.literalTypeId, "literal_HRZ"); | |||
br.close(); | |||
//load YAGO types | |||
in = new InputStreamReader(new FileInputStream(yagoFileName),"utf-8"); | |||
br = new BufferedReader(in); | |||
yagoTypeList = new HashSet<String>(); | |||
while((line = br.readLine())!=null) | |||
{ | |||
String[] lines = line.split("\t"); | |||
String typeName = lines[0]; | |||
yagoTypeList.add(typeName); | |||
} | |||
loadStopYagoTypeList(); | |||
yagoTypeList.removeAll(stopYagoTypeList); | |||
} | |||
} |
@@ -1,119 +0,0 @@ | |||
package lcn; | |||
import java.io.BufferedReader; | |||
import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.InputStreamReader; | |||
import java.util.Date; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.document.Document; | |||
import org.apache.lucene.document.Field; | |||
import org.apache.lucene.index.IndexWriter; | |||
import qa.Globals; | |||
public class BuildIndexForEntityFragments{ | |||
public void indexforentity() throws Exception | |||
{ | |||
if(EntityFragmentFields.entityId2Name == null) | |||
EntityFragmentFields.load(); | |||
long startTime = new Date().getTime(); | |||
//Try update KB index to DBpedia2015. by husen 2016-04-08 | |||
//Try update KB index to DBpedia2016. by husen 2018-8-22 | |||
File indexDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/entity_fragment_index"); | |||
File sourceDir_en = new File("D:/husen/gAnswer/data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"); | |||
Analyzer luceneAnalyzer_en = new StandardAnalyzer(); | |||
IndexWriter indexWriter_en = new IndexWriter(indexDir_en, luceneAnalyzer_en,true); | |||
int mergeFactor = 100000; //default 10 | |||
int maxBufferedDoc = 1000; //default 10 | |||
int maxMergeDoc = Integer.MAX_VALUE; //INF | |||
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; | |||
indexWriter_en.setMergeFactor(mergeFactor); | |||
indexWriter_en.setMaxBufferedDocs(maxBufferedDoc); | |||
indexWriter_en.setMaxMergeDocs(maxMergeDoc); | |||
FileInputStream file = new FileInputStream(sourceDir_en); | |||
InputStreamReader in = new InputStreamReader(file,"UTF-8"); | |||
BufferedReader br = new BufferedReader(in); | |||
int count = 0; | |||
while(true) | |||
{ | |||
String _line = br.readLine(); | |||
if(_line == null) break; | |||
count++; | |||
if(count % 100000 == 0) | |||
System.out.println(count); | |||
String line = _line; | |||
String temp[] = line.split("\t"); | |||
if(temp.length != 2) | |||
continue; | |||
else | |||
{ | |||
int entity_id = Integer.parseInt(temp[0]); | |||
if(!EntityFragmentFields.entityId2Name.containsKey(entity_id)) | |||
continue; | |||
String entity_name = EntityFragmentFields.entityId2Name.get(entity_id); | |||
String entity_fragment = temp[1]; | |||
entity_name = entity_name.replace("____", " "); | |||
entity_name = entity_name.replace("__", " "); | |||
entity_name = entity_name.replace("_", " "); | |||
Document document = new Document(); | |||
Field EntityName = new Field("EntityName", entity_name, Field.Store.YES, | |||
Field.Index.TOKENIZED, | |||
Field.TermVector.WITH_POSITIONS_OFFSETS); | |||
Field EntityId = new Field("EntityId", String.valueOf(entity_id), | |||
Field.Store.YES, Field.Index.NO); | |||
Field EntityFragment = new Field("EntityFragment", entity_fragment, | |||
Field.Store.YES, Field.Index.NO); | |||
document.add(EntityName); | |||
document.add(EntityId); | |||
document.add(EntityFragment); | |||
indexWriter_en.addDocument(document); | |||
} | |||
} | |||
indexWriter_en.optimize(); | |||
indexWriter_en.close(); | |||
br.close(); | |||
// report the time taken to build the index | |||
long endTime = new Date().getTime(); | |||
System.out.println("entity_name index has been built -> " + count + " " + "Time:" + (endTime - startTime)); | |||
} | |||
public static void main(String[] args) | |||
{ | |||
BuildIndexForEntityFragments bef = new BuildIndexForEntityFragments(); | |||
try | |||
{ | |||
Globals.localPath="D:/husen/gAnswer/"; | |||
bef.indexforentity(); | |||
} | |||
catch (Exception e) | |||
{ | |||
e.printStackTrace(); | |||
} | |||
} | |||
} | |||
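The class above targets a very old Lucene API (IndexWriter(File, Analyzer, boolean), Field.Index.TOKENIZED, setMergeFactor, optimize()) that no longer exists in current releases. If this index builder were kept and rebuilt for pkubase, the core of it would look roughly like the sketch below; it assumes Lucene 5+ on the classpath, and the index path and field values are illustrative only, not taken from the repository.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class ModernEntityIndexSketch {
    public static void main(String[] args) throws Exception {
        // Hedged sketch: same three fields as above, written with the post-3.x Lucene API (5+ assumed).
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(Paths.get("data/pkubase/lucene/entity_fragment_index")), // illustrative path
                new IndexWriterConfig(new StandardAnalyzer()));
        Document doc = new Document();
        doc.add(new TextField("EntityName", "Qin Shi Huang", Field.Store.YES)); // tokenized and stored
        doc.add(new StoredField("EntityId", "1"));                              // stored only
        doc.add(new StoredField("EntityFragment", "..."));                      // stored only
        writer.addDocument(doc);
        writer.close();
    }
}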
@@ -1,107 +0,0 @@ | |||
package lcn; | |||
import java.io.File; | |||
import java.util.ArrayList; | |||
import java.util.Date; | |||
import java.util.HashMap; | |||
import java.util.Iterator; | |||
import org.apache.lucene.analysis.Analyzer; | |||
import org.apache.lucene.analysis.standard.StandardAnalyzer; | |||
import org.apache.lucene.document.Document; | |||
import org.apache.lucene.document.Field; | |||
import org.apache.lucene.index.IndexWriter; | |||
import qa.Globals; | |||
import fgmt.TypeFragment; | |||
public class BuildIndexForTypeShortName { | |||
public static void buildIndex(HashMap<String, ArrayList<Integer>> typeShortName2IdList) throws Exception | |||
{ | |||
long startTime = new Date().getTime(); | |||
File indexDir_li = new File("D:/husen/gAnswer/data/DBpedia2016/lucene/type_fragment_index"); | |||
Analyzer luceneAnalyzer_li = new StandardAnalyzer(); | |||
IndexWriter indexWriter_li = new IndexWriter(indexDir_li, luceneAnalyzer_li,true); | |||
int mergeFactor = 100000; | |||
int maxBufferedDoc = 1000; | |||
int maxMergeDoc = Integer.MAX_VALUE; | |||
//indexWriter.DEFAULT_MERGE_FACTOR = mergeFactor; | |||
indexWriter_li.setMergeFactor(mergeFactor); | |||
indexWriter_li.setMaxBufferedDocs(maxBufferedDoc); | |||
indexWriter_li.setMaxMergeDocs(maxMergeDoc); | |||
int count = 0; | |||
Iterator<String> it = typeShortName2IdList.keySet().iterator(); | |||
while (it.hasNext()) | |||
{ | |||
String sn = it.next(); | |||
if (sn.length() == 0) { | |||
continue; | |||
} | |||
count ++; | |||
StringBuilder splittedSn = new StringBuilder(""); | |||
if(sn.contains("_")) | |||
{ | |||
String nsn = sn.replace("_", " "); | |||
splittedSn.append(nsn.toLowerCase()); | |||
} | |||
else | |||
{ | |||
int last = 0, i = 0; | |||
for(i = 0; i < sn.length(); i ++) | |||
{ | |||
// if it is not a lowercase letter, split here. | |||
if(!(sn.charAt(i)>='a' && sn.charAt(i)<='z')) | |||
{ | |||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||
splittedSn.append(' '); | |||
last = i; | |||
} | |||
} | |||
splittedSn.append(sn.substring(last, i).toLowerCase()); | |||
while(splittedSn.charAt(0) == ' ') { | |||
splittedSn.deleteCharAt(0); | |||
} | |||
} | |||
System.out.println("SplitttedType: "+splittedSn); | |||
Document document = new Document(); | |||
Field SplittedTypeShortName = new Field("SplittedTypeShortName", splittedSn.toString(), | |||
Field.Store.YES, | |||
Field.Index.TOKENIZED, | |||
Field.TermVector.WITH_POSITIONS_OFFSETS); | |||
Field TypeShortName = new Field("TypeShortName", sn, | |||
Field.Store.YES, Field.Index.NO); | |||
document.add(SplittedTypeShortName); | |||
document.add(TypeShortName); | |||
indexWriter_li.addDocument(document); | |||
} | |||
indexWriter_li.optimize(); | |||
indexWriter_li.close(); | |||
// report the time taken to build the index | |||
long endTime = new Date().getTime(); | |||
System.out.println("TypeShortName index has been built -> " + count + " " + "Time:" + (endTime - startTime)); | |||
} | |||
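The character loop above splits a camel-cased type short name into lowercase, space-separated tokens (names containing underscores are simply lowercased with the underscores replaced by spaces). A standalone sketch of the same rule, handy for testing it in isolation, is shown below; for example it turns "SoccerPlayer" into "soccer player".

// Sketch of the splitting rule used in buildIndex: start a new token before every non-lowercase character.
static String splitTypeShortName(String sn) {
    if (sn.contains("_"))
        return sn.replace("_", " ").toLowerCase();
    StringBuilder sb = new StringBuilder();
    int last = 0;
    for (int i = 0; i < sn.length(); i++) {
        if (!(sn.charAt(i) >= 'a' && sn.charAt(i) <= 'z')) {
            sb.append(sn.substring(last, i).toLowerCase()).append(' ');
            last = i;
        }
    }
    sb.append(sn.substring(last).toLowerCase());
    return sb.toString().trim(); // e.g. "SoccerPlayer" -> "soccer player"
}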
public static void main (String[] args) { | |||
try { | |||
Globals.localPath="D:/husen/gAnswer/"; | |||
TypeFragment.load(); | |||
BuildIndexForTypeShortName.buildIndex(TypeFragment.typeShortName2IdList); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
@@ -5,9 +5,13 @@ import java.io.File; | |||
import java.io.FileInputStream; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import fgmt.EntityFragment; | |||
import qa.Globals; | |||
import utils.FileUtil; | |||
public class EntityFragmentFields { | |||
@@ -18,8 +22,8 @@ public class EntityFragmentFields { | |||
public static void load() throws IOException | |||
{ | |||
String filename = Globals.localPath+"data/DBpedia2016/fragments/id_mappings/16entity_id.txt"; | |||
String fragmentFileName = Globals.localPath+"data/DBpedia2016/fragments/entity_RDF_fragment/16entity_fragment.txt"; | |||
String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkubase_entity_id.txt"; | |||
String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment.txt"; | |||
File file = new File(filename); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"utf-8")); | |||
@@ -35,7 +39,7 @@ public class EntityFragmentFields { | |||
while((line = br.readLine()) != null) | |||
{ | |||
String[] lines = line.split("\t"); | |||
String entName = lines[0].substring(1, lines[0].length()-1); | |||
String entName = lines[0].trim(); | |||
entName = entName.substring(1, entName.length()-1); | |||
entityName2Id.put(entName, Integer.parseInt(lines[1])); | |||
entityId2Name.put(Integer.parseInt(lines[1]), entName); | |||
@@ -61,4 +65,41 @@ public class EntityFragmentFields { | |||
br.close(); | |||
} | |||
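// genmini(): builds a reduced id-mapping for debugging. It reads the mini fragment file, collects
// the id of every entity that appears there (plus the ids in each fragment's inEntMap/outEntMap),
// then prints the lines of the full id-mapping file whose id is in that set, presumably so the
// output can be redirected into a smaller pkubase id-mapping file.
// NOTE: eids is an ArrayList, so the contains() check below is linear; a HashSet would be cheaper.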
public static void genmini() | |||
{ | |||
String filename = Globals.localPath+"data/pkubase/fragments/id_mappings/pkuentity_id.txt"; | |||
String fragmentFileName = Globals.localPath+"data/pkubase/fragments/pkubase_entity_fragment_mini.txt"; | |||
List<String> fragments = FileUtil.readFile(fragmentFileName); | |||
ArrayList<Integer> eids = new ArrayList<Integer>(); | |||
for(String fragment: fragments) | |||
{ | |||
int eid = Integer.parseInt(fragment.split("\t")[0]); | |||
String fgmt = fragment.split("\t")[1]; | |||
EntityFragment ef = new EntityFragment(eid, fgmt); | |||
eids.add(eid); | |||
for(int ent: ef.inEntMap.keySet()) | |||
{ | |||
eids.add(ent); | |||
} | |||
for(int ent: ef.outEntMap.keySet()) | |||
{ | |||
eids.add(ent); | |||
} | |||
} | |||
System.out.println(eids.size()); | |||
System.out.println("Loading entity id ..."); | |||
List<String> data = FileUtil.readFile(filename); | |||
for(String line: data) | |||
{ | |||
String[] lines = line.split("\t"); | |||
int eid = Integer.parseInt(lines[1]); | |||
if(eids.contains(eid)) | |||
System.out.println(line); | |||
} | |||
} | |||
public static void main(String[] args) { | |||
EntityFragmentFields.genmini(); | |||
} | |||
} |
@@ -12,7 +12,6 @@ import qa.Query; | |||
import rdf.EntityMapping; | |||
import rdf.SemanticRelation; | |||
import rdf.Sparql; | |||
import rdf.MergedWord; | |||
import rdf.SemanticUnit; | |||
import qa.Answer; | |||
import nlp.ds.Sentence; | |||
@@ -30,10 +29,8 @@ public class QueryLogger { | |||
public boolean MODE_debug = false; | |||
public boolean MODE_log = true; | |||
public boolean MODE_fragment = true; | |||
public boolean isMaltParserUsed = true; // Notice, we utilize Malt Parser as default parser, which is different from the older version. TODO: some coref rules need changed to fit Malt Parser. | |||
public boolean isMaltParserUsed = false; // MaltParser is deprecated. | |||
public HashMap<String, Integer> timeTable = null; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public ArrayList<SemanticUnit> semanticUnitList = null; | |||
public HashMap<Integer, SemanticRelation> semanticRelations = null; | |||
public HashMap<Integer, SemanticRelation> potentialSemanticRelations = null; | |||
@@ -48,7 +45,6 @@ public class QueryLogger { | |||
{ | |||
timeTable = new HashMap<String, Integer>(); | |||
rankedSparqls = new ArrayList<Sparql>(); | |||
mWordList = query.mWordList; | |||
} | |||
public void reloadSentence(Sentence sentence) | |||
@@ -6,75 +6,37 @@ import java.util.HashMap; | |||
import java.util.List; | |||
import java.util.Stack; | |||
import nlp.tool.CoreNLP; | |||
import nlp.tool.MaltParser; | |||
import nlp.tool.StanfordParser; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
import org.maltparser.core.syntaxgraph.node.DependencyNode; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
import edu.stanford.nlp.ling.IndexedWord; | |||
import edu.stanford.nlp.ling.SentenceUtils; | |||
import edu.stanford.nlp.trees.GrammaticalStructure; | |||
import edu.stanford.nlp.trees.TypedDependency; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||
public class DependencyTree { | |||
public DependencyTreeNode root = null; | |||
public ArrayList<DependencyTreeNode> nodesList = null; | |||
public SemanticGraph dependencies = null; // Method 1: CoreNLP (discarded) | |||
public GrammaticalStructure gs = null; // Method 2: Stanford Parser | |||
public DependencyStructure maltGraph = null; // Method 3: MaltParser | |||
// public GrammaticalStructure gs = null; // Method 2: Stanford Parser | |||
public HashMap<String, ArrayList<DependencyTreeNode>> wordBaseFormIndex = null; | |||
public DependencyTree (Sentence sentence, CoreNLP coreNLPparser) { | |||
SemanticGraph dependencies = coreNLPparser.getBasicDependencies(sentence.plainText); | |||
this.dependencies = dependencies; | |||
Stack<IndexedWord> stack = new Stack<IndexedWord>(); | |||
IndexedWord iwRoot = dependencies.getFirstRoot(); | |||
HashMap<IndexedWord, DependencyTreeNode> map = new HashMap<IndexedWord, DependencyTreeNode>(); | |||
nodesList = new ArrayList<DependencyTreeNode>(); | |||
stack.push(iwRoot); | |||
root = this.setRoot(sentence.getWordByIndex(iwRoot.index())); | |||
map.put(iwRoot, root); | |||
while (!stack.empty()) | |||
{ | |||
IndexedWord curIWNode = stack.pop(); | |||
DependencyTreeNode curDTNode = map.get(curIWNode); | |||
for (IndexedWord iwChild : dependencies.getChildList(curIWNode)) { | |||
Word w = sentence.getWordByIndex(iwChild.index()); | |||
DependencyTreeNode newDTNode = this.insert( | |||
curDTNode, | |||
w, | |||
dependencies.reln(curIWNode, iwChild).getShortName()); | |||
map.put(iwChild, newDTNode); | |||
stack.push(iwChild); | |||
} | |||
curDTNode.sortChildrenList(); | |||
nodesList.add(curDTNode); | |||
} | |||
} | |||
public DependencyTree (Sentence sentence, StanfordParser stanfordParser) { | |||
this.gs = stanfordParser.getGrammaticalStructure(sentence.plainText); | |||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||
nodesList = new ArrayList<DependencyTreeNode>(); | |||
List<TypedDependency> tdl = gs.typedDependencies(false); | |||
// String[] sent = { "这", "是", "一个", "简单", "的", "句子", "。" }; | |||
String[] sent = sentence.getWordsArr(); | |||
List<CoreLabel> rawWords = SentenceUtils.toCoreLabelList(sent); | |||
List<TypedDependency> tdl = stanfordParser.getTypedDependencyList(rawWords); | |||
// 1. generate all nodes. | |||
for (TypedDependency td : tdl) { | |||
// gov | |||
if (!map.containsKey(td.gov().index()) && !td.reln().getShortName().equals("root")) { | |||
Word w = sentence.getWordByIndex(td.gov().index()); | |||
w.posTag = td.gov().tag(); // POS TAG | |||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||
map.put(td.gov().index(), newNode); | |||
nodesList.add(newNode); | |||
@@ -82,6 +44,7 @@ public class DependencyTree { | |||
// dep | |||
if (!map.containsKey(td.dep().index())) { | |||
Word w = sentence.getWordByIndex(td.dep().index()); | |||
w.posTag = td.dep().tag(); // POS TAG | |||
DependencyTreeNode newNode = new DependencyTreeNode(w); | |||
map.put(td.dep().index(), newNode); | |||
nodesList.add(newNode); | |||
@@ -118,139 +81,9 @@ public class DependencyTree { | |||
} | |||
} | |||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||
for (DependencyTreeNode dtn : nodesList) { | |||
dtn.linkNN(this); | |||
} | |||
} | |||
public DependencyTree (Sentence sentence, MaltParser maltParser)throws MaltChainedException { | |||
try { | |||
// the tokens are parsed in the following line | |||
DependencyStructure graph = maltParser.getDependencyStructure(sentence); | |||
this.maltGraph = graph; | |||
//System.out.println(graph); | |||
HashMap<Integer, DependencyTreeNode> map = new HashMap<Integer, DependencyTreeNode>(); | |||
ArrayList<DependencyTreeNode> list = new ArrayList<DependencyTreeNode>(); | |||
Stack<DependencyNode> stack = new Stack<DependencyNode>(); | |||
DependencyNode nroot = graph.getDependencyRoot(); | |||
stack.add(nroot); | |||
// 1. generate all nodes. | |||
while (!stack.isEmpty()) { | |||
DependencyNode n = stack.pop(); | |||
DependencyNode sib = n.getRightmostDependent(); | |||
int key = n.getIndex(); | |||
//System.out.println("[current node][key="+key+"] "+n+" <"+n.getHeadEdge()+">"); | |||
boolean flag = true; | |||
while (sib != null) { | |||
flag = false; | |||
stack.push(sib); | |||
sib = sib.getLeftSibling(); | |||
} | |||
if (flag) { | |||
sib = n.getLeftmostDependent(); | |||
while (sib != null) { | |||
stack.push(sib); | |||
sib = sib.getRightSibling(); | |||
} | |||
} | |||
if (n.hasHead() && !map.containsKey(key)) { | |||
//String snode = n.toString(); | |||
String sedge = n.getHeadEdge().toString(); | |||
//System.out.println("[" + snode + "] <" + sedge + ">"); | |||
/*int position = 0; | |||
String wordOriginal = null; | |||
String wordBase; | |||
String postag = null;*/ | |||
String dep = null; | |||
int idx1, idx2; | |||
/*// position | |||
idx1 = snode.indexOf("ID:")+3; | |||
idx2 = snode.indexOf(' ', idx1); | |||
position = Integer.parseInt(snode.substring(idx1, idx2)); | |||
// word | |||
idx1 = snode.indexOf("FORM:", idx2)+5; | |||
idx2 = snode.indexOf(' ', idx1); | |||
wordOriginal = snode.substring(idx1, idx2); | |||
wordBase = Globals.coreNLP.getBaseFormOfPattern(wordOriginal.toLowerCase()); | |||
// postag | |||
idx1 = snode.indexOf("POSTAG:", idx2)+7; | |||
idx2 = snode.indexOf(' ', idx1); | |||
postag = snode.substring(idx1, idx2);*/ | |||
// dep | |||
idx1 = sedge.lastIndexOf(':')+1; | |||
idx2 = sedge.lastIndexOf(' '); | |||
dep = sedge.substring(idx1, idx2); | |||
if (dep.equals("null")) { | |||
dep = null; | |||
} | |||
else if (dep.equals("punct")) {// No consider about punctuation | |||
continue; | |||
} | |||
DependencyTreeNode newNode = new DependencyTreeNode(sentence.getWordByIndex(key)); | |||
newNode.dep_father2child = dep; | |||
map.put(key, newNode); | |||
list.add(newNode); | |||
} | |||
} | |||
// 2. add edges | |||
for (Integer k : map.keySet()) { | |||
DependencyNode n = graph.getDependencyNode(k); | |||
DependencyTreeNode dtn = map.get(k); | |||
if (dtn.dep_father2child == null) { | |||
this.setRoot(dtn); | |||
this.root.levelInTree = 0; | |||
this.root.dep_father2child = "root"; | |||
} | |||
else { | |||
DependencyTreeNode father = map.get(n.getHead().getIndex()); | |||
DependencyTreeNode child = map.get(n.getIndex()); | |||
child.father = father; | |||
father.childrenList.add(child); | |||
} | |||
} | |||
// Fix the tree for some cases. | |||
if(list.size() > 11) | |||
{ | |||
DependencyTreeNode dt1 = list.get(11), dt2 = list.get(5); | |||
if(dt1!=null && dt2!=null && dt1.word.baseForm.equals("star") && dt1.father.word.baseForm.equals("be")) | |||
{ | |||
if (dt2.word.baseForm.equals("film") || dt2.word.baseForm.equals("movie")) | |||
{ | |||
dt1.father.childrenList.remove(dt1); | |||
dt1.father = dt2; | |||
dt2.childrenList.add(dt1); | |||
} | |||
} | |||
} | |||
// add levelInTree, sort childrenList & nodesList | |||
for (DependencyTreeNode dtn : list) { | |||
if (dtn.father != null) { | |||
dtn.levelInTree = dtn.father.levelInTree + 1; | |||
dtn.sortChildrenList(); | |||
} | |||
} | |||
nodesList = list; | |||
Collections.sort(nodesList, new DependencyTreeNodeComparator()); | |||
for (DependencyTreeNode dtn : nodesList) { | |||
dtn.linkNN(this); | |||
} | |||
} catch (MaltChainedException e) { | |||
//e.printStackTrace(); | |||
//System.err.println("MaltParser exception: " + e.getMessage()); | |||
throw e; | |||
} | |||
// for (DependencyTreeNode dtn : nodesList) { | |||
// dtn.linkNN(this); | |||
// } | |||
} | |||
public DependencyTreeNode setRoot(Word w) { | |||
@@ -2,10 +2,10 @@ package nlp.ds; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import qa.Globals; | |||
import qa.Query; | |||
import rdf.MergedWord; | |||
public class Sentence { | |||
public String plainText = null; | |||
@@ -18,40 +18,64 @@ public class Sentence { | |||
public enum SentenceType {SpecialQuestion,GeneralQuestion,ImperativeSentence} | |||
public SentenceType sentenceType = SentenceType.SpecialQuestion; | |||
public Sentence (String s) | |||
// public Sentence (String s) | |||
// { | |||
// plainText = s; | |||
// words = Globals.coreNLP.getTaggedWords(plainText); | |||
// map = new HashMap<String, Word>(); | |||
// for (Word w : words) | |||
// map.put(w.key, w); | |||
// } | |||
// for tokenized sentence | |||
public Sentence (List<Word> wordList, String s) | |||
{ | |||
plainText = s; | |||
words = Globals.coreNLP.getTaggedWords(plainText); | |||
words = new Word[wordList.size()]; | |||
for(int i=0; i<wordList.size(); i++) | |||
words[i] = wordList.get(i); | |||
map = new HashMap<String, Word>(); | |||
for (Word w : words) | |||
map.put(w.key, w); | |||
} | |||
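With the string-based constructors commented out, callers now build a Sentence from an already tokenized word list. A minimal usage sketch, assuming the Word(baseForm, originalForm, posTag, position) constructor used by CoreNLP.getTaggedWords elsewhere in this diff, 1-based positions, and illustrative POS tags:

List<Word> tokens = new ArrayList<Word>();
tokens.add(new Word("谁", "谁", "PN", 1));
tokens.add(new Word("是", "是", "VC", 2));
tokens.add(new Word("狄仁杰", "狄仁杰", "NR", 3));
Sentence sent = new Sentence(tokens, "谁是狄仁杰");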
public Sentence (Query query, String s) | |||
{ | |||
plainText = s; | |||
words = Globals.coreNLP.getTaggedWords(plainText); | |||
// inherit NodeRecognition's information | |||
for(Word word: words) | |||
// public Sentence (Query query, String s) | |||
// { | |||
// plainText = s; | |||
// words = Globals.coreNLP.getTaggedWords(plainText); | |||
// // inherit NodeRecognition's information | |||
// for(Word word: words) | |||
// { | |||
// for(MergedWord mWord: query.mWordList) | |||
// { | |||
// if(word.originalForm.equals(mWord.name)) | |||
// { | |||
// word.mayLiteral = mWord.mayLiteral; | |||
// word.mayEnt = mWord.mayEnt; | |||
// word.mayType = mWord.mayType; | |||
// word.mayCategory = mWord.mayCategory; | |||
// word.tmList = mWord.tmList; | |||
// word.emList = mWord.emList; | |||
// word.category = mWord.category; | |||
// } | |||
// } | |||
// } | |||
// map = new HashMap<String, Word>(); | |||
// for (Word w : words) | |||
// map.put(w.key, w); | |||
// } | |||
public String[] getWordsArr() { | |||
String[] wordArr = new String[words.length]; | |||
int cnt = 0; | |||
for(Word w: words) | |||
{ | |||
for(MergedWord mWord: query.mWordList) | |||
{ | |||
if(word.originalForm.equals(mWord.name)) | |||
{ | |||
word.mayLiteral = mWord.mayLiteral; | |||
word.mayEnt = mWord.mayEnt; | |||
word.mayType = mWord.mayType; | |||
word.mayCategory = mWord.mayCategory; | |||
word.tmList = mWord.tmList; | |||
word.emList = mWord.emList; | |||
word.category = mWord.category; | |||
} | |||
} | |||
wordArr[cnt++] = w.originalForm; | |||
} | |||
map = new HashMap<String, Word>(); | |||
for (Word w : words) | |||
map.put(w.key, w); | |||
return wordArr; | |||
} | |||
public ArrayList<Word> getWordsByString (String w) { | |||
ArrayList<Word> ret = new ArrayList<Word>(); | |||
for (Word wo: words) { | |||
@@ -1,201 +0,0 @@ | |||
package nlp.tool; | |||
import java.util.List; | |||
import java.util.Properties; | |||
import nlp.ds.Word; | |||
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
import edu.stanford.nlp.pipeline.Annotation; | |||
import edu.stanford.nlp.pipeline.StanfordCoreNLP; | |||
import edu.stanford.nlp.trees.Tree; | |||
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraph; | |||
import edu.stanford.nlp.trees.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation; | |||
import edu.stanford.nlp.util.CoreMap; | |||
public class CoreNLP { | |||
// CoreNLP can also recognize TIME and NUMBER (see SUTime) | |||
private StanfordCoreNLP pipeline_lemma; | |||
public CoreNLP () { | |||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||
/*Properties props_all = new Properties(); | |||
props_all.put("annotators", "tokenize, ssplit, pos, lemma, parse"); // full list: "tokenize, ssplit, pos, lemma, ner, parse, dcoref" | |||
pipeline_all = new StanfordCoreNLP(props_all);*/ | |||
Properties props_lemma = new Properties(); | |||
props_lemma.put("annotators", "tokenize, ssplit, pos, lemma"); | |||
pipeline_lemma = new StanfordCoreNLP(props_lemma); | |||
} | |||
// For more efficient usage, refer to "http://www.jarvana.com/jarvana/view/edu/stanford/nlp/stanford-corenlp/1.2.0/stanford-corenlp-1.2.0-javadoc.jar!/edu/stanford/nlp/process/Morphology.html" | |||
public String getBaseFormOfPattern (String text) { | |||
String ret = new String(""); | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
int count = 0; | |||
for(CoreMap sentence: sentences) { | |||
// traversing the words in the current sentence | |||
// a CoreLabel is a CoreMap with additional token-specific methods | |||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||
// this is the base form (lemma) of the token | |||
String lemma = token.getString(LemmaAnnotation.class); | |||
ret += lemma; | |||
ret += " "; | |||
} | |||
count ++; | |||
if (count % 100 == 0) { | |||
System.out.println(count); | |||
} | |||
} | |||
return ret.substring(0, ret.length()-1); | |||
} | |||
public SemanticGraph getBasicDependencies (String s) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(s); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the Stanford dependency graph of the current sentence | |||
SemanticGraph dependencies = sentence.get(BasicDependenciesAnnotation.class); | |||
return dependencies; | |||
} | |||
return null; | |||
} | |||
public Tree getParseTree (String text) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the parse tree of the current sentence | |||
return sentence.get(TreeAnnotation.class); | |||
} | |||
return null; | |||
} | |||
/** | |||
* How to use: | |||
* for (CoreLabel token : sentence.get(TokensAnnotation.class)) { | |||
* // this is the text of the token | |||
* String word = token.get(TextAnnotation.class); | |||
* // this is the POS tag of the token | |||
* String pos = token.get(PartOfSpeechAnnotation.class); | |||
* } | |||
* @param s | |||
* @return | |||
*/ | |||
public CoreMap getPOS (String s) { | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(s); | |||
// run all Annotators on this text | |||
pipeline_lemma.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// this is the sentence with POS Tags | |||
return sentence; | |||
} | |||
return null; | |||
} | |||
public Word[] getTaggedWords (String sentence) { | |||
CoreMap taggedSentence = getPOS(sentence); | |||
Word[] ret = new Word[taggedSentence.get(TokensAnnotation.class).size()]; | |||
int count = 0; | |||
for (CoreLabel token : taggedSentence.get(TokensAnnotation.class)) { | |||
// this is the text of the token | |||
String word = token.get(TextAnnotation.class); | |||
// this is the POS tag of the token | |||
String pos = token.get(PartOfSpeechAnnotation.class); | |||
//System.out.println(word+"["+pos+"]"); | |||
ret[count] = new Word(getBaseFormOfPattern(word.toLowerCase()), word, pos, count+1); | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
/*public void demo () { | |||
// creates a StanfordCoreNLP object, with POS tagging, lemmatization, NER, parsing, and coreference resolution | |||
Properties props = new Properties(); | |||
props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); | |||
StanfordCoreNLP pipeline = new StanfordCoreNLP(props); | |||
// read some text in the text variable | |||
String text = ... // Add your text here! | |||
// create an empty Annotation just with the given text | |||
Annotation document = new Annotation(text); | |||
// run all Annotators on this text | |||
pipeline.annotate(document); | |||
// these are all the sentences in this document | |||
// a CoreMap is essentially a Map that uses class objects as keys and has values with custom types | |||
List<CoreMap> sentences = document.get(SentencesAnnotation.class); | |||
for(CoreMap sentence: sentences) { | |||
// traversing the words in the current sentence | |||
// a CoreLabel is a CoreMap with additional token-specific methods | |||
for (CoreLabel token: sentence.get(TokensAnnotation.class)) { | |||
// this is the text of the token | |||
String word = token.get(TextAnnotation.class); | |||
// this is the POS tag of the token | |||
String pos = token.get(PartOfSpeechAnnotation.class); | |||
// this is the NER label of the token | |||
String ne = token.get(NamedEntityTagAnnotation.class); | |||
} | |||
// this is the parse tree of the current sentence | |||
Tree tree = sentence.get(TreeAnnotation.class); | |||
// this is the Stanford dependency graph of the current sentence | |||
SemanticGraph dependencies = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class); | |||
} | |||
// This is the coreference link graph | |||
// Each chain stores a set of mentions that link to each other, | |||
// along with a method for getting the most representative mention | |||
// Both sentence and token offsets start at 1! | |||
Map<Integer, CorefChain> graph = | |||
document.get(CorefChainAnnotation.class); | |||
}*/ | |||
} |
@@ -21,13 +21,10 @@ public class Main { | |||
break; | |||
try { | |||
long t1 = System.currentTimeMillis(); | |||
Sentence s = new Sentence(question); | |||
Sentence s = null; | |||
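// NOTE: with the string-based Sentence constructor removed, this debug path needs a Sentence
// built from a pre-tokenized word list (Sentence(List<Word>, String)); passing null here will
// throw a NullPointerException inside DependencyTree.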
DependencyTree dt = new DependencyTree(s, Globals.stanfordParser); | |||
System.out.println("====StanfordDependencies===="); | |||
System.out.println(dt); | |||
DependencyTree dt2 = new DependencyTree(s, Globals.maltParser); | |||
System.out.println("====MaltDependencies===="); | |||
System.out.println(dt2); | |||
long t2 = System.currentTimeMillis(); | |||
System.out.println("time=" + (t2-t1) + "ms"); | |||
} catch (Exception e) { | |||
@@ -1,70 +0,0 @@ | |||
package nlp.tool; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import org.maltparser.MaltParserService; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
import qa.Globals; | |||
public class MaltParser { | |||
private MaltParserService service = null; | |||
public MaltParser() { | |||
try | |||
{ | |||
System.out.print("Loading MaltParser ..."); | |||
service = new MaltParserService(); | |||
// Initialize the parser model 'model0', set the working directory to '.', and set the logging file to 'parser.log' | |||
//service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w . -lfi parser.log"); | |||
service.initializeParserModel("-c engmalt.linear-1.7 -m parse -w "+Globals.localPath+"lib/maltparser-1.9.1 -lfi parser.log"); | |||
firstParse(); | |||
System.out.println("ok!"); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
System.err.println("MaltParser exception: " + e.getMessage()); | |||
} | |||
} | |||
private void firstParse() { | |||
String[] tokens = new String[12]; | |||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||
try { | |||
service.parse(tokens); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
public DependencyStructure getDependencyStructure (Sentence sentence) { | |||
try { | |||
return service.parse(getTaggedTokens(sentence)); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
return null; | |||
} | |||
private String[] getTaggedTokens (Sentence sentence) { | |||
String[] ret = new String[sentence.words.length]; | |||
int count = 0; | |||
for (Word w : sentence.words) { | |||
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
} |
@@ -1,73 +0,0 @@ | |||
package nlp.tool; | |||
import java.io.File; | |||
import java.net.URL; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import org.maltparser.concurrent.ConcurrentMaltParserModel; | |||
import org.maltparser.concurrent.ConcurrentMaltParserService; | |||
import org.maltparser.concurrent.graph.ConcurrentDependencyGraph; | |||
import org.maltparser.core.exception.MaltChainedException; | |||
//import org.maltparser.core.syntaxgraph.DependencyStructure; | |||
public class MaltParserCon { | |||
private ConcurrentMaltParserModel model = null; | |||
public ConcurrentDependencyGraph outputGraph = null; | |||
public MaltParserCon(){ | |||
try{ | |||
System.out.println("Loading Maltparser...\n"); | |||
URL ModelURL = new File("output/engmalt.linear-1.7.mco").toURI().toURL(); | |||
model = ConcurrentMaltParserService.initializeParserModel(ModelURL); | |||
firstTest(); | |||
System.out.println("ok!\n"); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
System.err.println("MaltParser exception: " + e.getMessage()); | |||
} | |||
} | |||
private void firstTest(){ | |||
String[] tokens = new String[12]; | |||
tokens[0] = "1\tIn\t_\tIN\tIN\t_"; | |||
tokens[1] = "2\twhich\t_\tWDT\tWDT\t_"; | |||
tokens[2] = "3\tmovies\t_\tNNS\tNNS\t_"; | |||
tokens[3] = "4\tdirected\t_\tVBN\tVBN\t_"; | |||
tokens[4] = "5\tby\t_\tIN\tIN\t_"; | |||
tokens[5] = "6\tGarry\t_\tNNP\tNNP\t_"; | |||
tokens[6] = "7\tMarshall\t_\tNNP\tNNP\t_"; | |||
tokens[7] = "8\twas\t_\tVBD\tVBD\t_"; | |||
tokens[8] = "9\tJulia\t_\tNNP\tNNP\t_"; | |||
tokens[9] = "10\tRoberts\t_\tNNP\tNNP\t_"; | |||
tokens[10] = "11\tstarring\t_\tVBG\tVBG\t_"; | |||
tokens[11] = "12\t?\t_\t.\t.\t_"; | |||
try { | |||
outputGraph = model.parse(tokens); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
System.out.println(outputGraph); | |||
} | |||
public ConcurrentDependencyGraph getDependencyStructure (Sentence sentence) { | |||
try { | |||
return model.parse(getTaggedTokens(sentence)); | |||
} catch (MaltChainedException e) { | |||
e.printStackTrace(); | |||
} | |||
return null; | |||
} | |||
private String[] getTaggedTokens (Sentence sentence) { | |||
String[] ret = new String[sentence.words.length]; | |||
int count = 0; | |||
for (Word w : sentence.words) { | |||
ret[count] = new String(""+w.position+"\t"+w.originalForm+"\t_\t"+w.posTag+"\t"+w.posTag+"\t_"); | |||
count ++; | |||
} | |||
return ret; | |||
} | |||
} |
@@ -1,53 +0,0 @@ | |||
package nlp.tool; | |||
import java.util.List; | |||
import qa.Globals; | |||
import nlp.ds.Sentence; | |||
import nlp.ds.Word; | |||
import edu.stanford.nlp.ie.AbstractSequenceClassifier; | |||
import edu.stanford.nlp.ie.crf.CRFClassifier; | |||
import edu.stanford.nlp.ling.CoreAnnotations.AnswerAnnotation; | |||
import edu.stanford.nlp.ling.CoreAnnotations.PositionAnnotation; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
public class NERecognizer { | |||
static String serializedClassifier; | |||
static AbstractSequenceClassifier<CoreLabel> classifier; | |||
//public static String localPath="E:\\Hanshuo\\gAnswer\\"; | |||
public NERecognizer() { | |||
serializedClassifier = Globals.localPath+"lib/stanford-ner-2012-11-11/classifiers/english.all.3class.distsim.crf.ser.gz"; | |||
classifier = CRFClassifier.getClassifierNoExceptions(serializedClassifier); | |||
} | |||
/*public NERecognizer(String basePath, boolean flag) { | |||
serializedClassifier = "WEB-INF\\lib\\stanford-ner-2012-11-11\\stanford-ner-2012-11-11\\classifiers\\english.all.3class.distsim.crf.ser.gz"; | |||
}*/ | |||
public void recognize(Sentence sentence) { | |||
List<CoreLabel> lcl = classifier.classify(sentence.plainText).get(0); | |||
for (CoreLabel cl : lcl) { | |||
int position = Integer.parseInt(cl.get(PositionAnnotation.class))+1; | |||
Word w = sentence.getWordByIndex(position); | |||
String ner = cl.get(AnswerAnnotation.class); | |||
if (ner.equals("O")) w.ner = null; | |||
else w.ner = ner; | |||
} | |||
} | |||
public static void main(String[] args) { | |||
System.out.println("Test NER"); | |||
Globals.init(); | |||
Sentence s = new Sentence("I go to school at Stanford University, which is located in California.");//"Which states of Germany are governed by the Social Democratic Party?" | |||
Globals.nerRecognizer.recognize(s); | |||
for (Word word : s.words) { | |||
System.out.print(word + " "); | |||
System.out.println("ner=" + word.ner); | |||
} | |||
} | |||
} |
@@ -4,7 +4,6 @@ import java.io.StringReader; | |||
import java.util.List; | |||
import edu.stanford.nlp.ling.CoreLabel; | |||
import edu.stanford.nlp.objectbank.TokenizerFactory; | |||
import edu.stanford.nlp.parser.lexparser.LexicalizedParser; | |||
import edu.stanford.nlp.process.CoreLabelTokenFactory; | |||
import edu.stanford.nlp.process.PTBTokenizer; | |||
@@ -13,39 +12,40 @@ import edu.stanford.nlp.trees.GrammaticalStructureFactory; | |||
import edu.stanford.nlp.trees.PennTreebankLanguagePack; | |||
import edu.stanford.nlp.trees.Tree; | |||
import edu.stanford.nlp.trees.TreebankLanguagePack; | |||
import edu.stanford.nlp.trees.TypedDependency; | |||
import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure; | |||
public class StanfordParser { | |||
private LexicalizedParser lp; | |||
private TokenizerFactory<CoreLabel> tokenizerFactory; | |||
private TreebankLanguagePack tlp; | |||
private GrammaticalStructureFactory gsf; | |||
private ChineseGrammaticalStructure gs; | |||
// private TokenizerFactory<CoreLabel> tokenizerFactory; | |||
// private TreebankLanguagePack tlp; | |||
// private GrammaticalStructureFactory gsf; | |||
public StanfordParser() { | |||
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); | |||
tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); | |||
tlp = new PennTreebankLanguagePack(); | |||
gsf = tlp.grammaticalStructureFactory(); | |||
// lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"); | |||
// tokenizerFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); | |||
// tlp = new PennTreebankLanguagePack(); | |||
// gsf = tlp.grammaticalStructureFactory(); | |||
lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz"); | |||
} | |||
public GrammaticalStructure getGrammaticalStructure (String sentence) { | |||
List<CoreLabel> rawWords2 = | |||
tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); | |||
// Converts a Sentence/List/String into a Tree. | |||
// In all circumstances, the input will be treated as a single sentence to be parsed. | |||
Tree parse = lp.apply(rawWords2); | |||
return gsf.newGrammaticalStructure(parse); | |||
/*List<TypedDependency> tdl = gs.typedDependencies(false); | |||
for (TypedDependency td : tdl) { | |||
System.out.println(td.reln().getShortName()+"("+td.gov()+","+td.dep()+")"); | |||
System.out.println("gov="+td.gov() | |||
+"\tgov.index=" | |||
+td.gov().index() | |||
+"\tgov.value=" | |||
+td.gov().value() | |||
+"\tgov.pos=" | |||
+((TreeGraphNode)td.gov().parent()).value()); | |||
}*/ | |||
//System.out.println(tdl); | |||
// public GrammaticalStructure getGrammaticalStructure (String sentence) { | |||
// List<CoreLabel> rawWords2 = | |||
// tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize(); | |||
// | |||
// Tree parse = lp.apply(rawWords2); | |||
// | |||
// return gsf.newGrammaticalStructure(parse); | |||
// } | |||
public List<TypedDependency> getTypedDependencyList(List<CoreLabel> rawWords) | |||
{ | |||
Tree parse = lp.apply(rawWords); | |||
gs = new ChineseGrammaticalStructure(parse); | |||
return gs.typedDependenciesCCprocessed(); | |||
} | |||
} |
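StanfordParser now loads the Chinese PCFG model and exposes only getTypedDependencyList, which expects pre-tokenized CoreLabels; DependencyTree builds that input with SentenceUtils.toCoreLabelList, as shown earlier in this diff. A minimal standalone usage sketch under the same assumptions (imports as in DependencyTree; the sample tokens are the ones from its commented-out example):

String[] tokens = { "这", "是", "一个", "简单", "的", "句子", "。" };
List<CoreLabel> rawWords = SentenceUtils.toCoreLabelList(tokens);
List<TypedDependency> deps = new StanfordParser().getTypedDependencyList(rawWords);
for (TypedDependency td : deps)
    System.out.println(td.reln().getShortName() + "(" + td.gov() + ", " + td.dep() + ")");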
@@ -10,19 +10,17 @@ import java.util.Collections; | |||
import java.util.HashMap; | |||
import java.util.HashSet; | |||
import java.util.Iterator; | |||
import java.util.List; | |||
import com.huaban.analysis.jieba.SegToken; | |||
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; | |||
import nlp.tool.CoreNLP; | |||
import qa.Globals; | |||
import qa.extract.EntityRecognitionCh; | |||
public class ParaphraseDictionary { | |||
public static String localDataPath; | |||
public static String dbpedia_relation_paraphrases_baseform_withScore; | |||
public static String dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||
public static String dbpedia_relation_paraphrases_handwrite; | |||
public static String dbpedia_predicate_id; | |||
public static String relation_paraphrases_path; | |||
public static String predicate_id_path; | |||
public static String dbpedia_dbo_predicate; | |||
public HashMap<String, Integer> predicate_2_id = null; | |||
@@ -41,24 +39,14 @@ public class ParaphraseDictionary { | |||
public int paraphrasedPredCount = 0; | |||
public int lineCount = 0; | |||
/** | |||
* constructor | |||
* @param parser | |||
* @param ner | |||
*/ | |||
public ParaphraseDictionary () { | |||
String fixedPath = Globals.localPath; | |||
String fixedPath = Globals.localPath+"data/pkubase/"; | |||
System.out.println(System.getProperty("user.dir")); | |||
localDataPath = fixedPath + "data/DBpedia2016/parapharse/"; | |||
dbpedia_relation_paraphrases_baseform_withScore_rerank = localDataPath + "dbpedia-relation-paraphrases-withScore-baseform-merge-sorted-rerank-slct.txt"; | |||
dbpedia_relation_paraphrases_handwrite = localDataPath + "dbpedia-relation-paraphrase-handwrite.txt"; | |||
dbpedia_predicate_id = localDataPath + "16predicate_id.txt"; | |||
dbpedia_dbo_predicate = localDataPath + "16dbo_predicates.txt"; | |||
relation_paraphrases_path = fixedPath + "paraphrase/pkubase-paraphrase.txt"; | |||
predicate_id_path = fixedPath + "fragments/id_mappings/pkubase_predicate_id.txt"; | |||
bannedTypes = new HashSet<String>(); | |||
bannedTypes.add("Mayor"); | |||
relns_subject = new HashSet<String>(); | |||
relns_subject.add("subj"); | |||
@@ -76,25 +64,16 @@ public class ParaphraseDictionary { | |||
relns_object.add("obj"); | |||
relns_object.add("pobj"); | |||
prepositions = new HashSet<String>(); | |||
prepositions.add("in");//in at on with to from before after of for | |||
prepositions.add("at"); | |||
prepositions.add("on"); | |||
prepositions.add("with"); | |||
prepositions.add("to"); | |||
prepositions.add("from"); | |||
prepositions.add("before"); | |||
prepositions.add("after"); | |||
prepositions.add("of"); | |||
prepositions.add("for"); | |||
prepositions.add("as"); | |||
prepositions = new HashSet<String>(); //TODO: safe delete | |||
try { | |||
loadPredicateId(); | |||
loadDboPredicate(); | |||
loadParaDict(); | |||
addPredicateAsNLPattern(); | |||
addHandwriteAsNLPattern(); | |||
// loadDboPredicate(); | |||
// loadParaDict(); | |||
buildInvertedIndex(); | |||
typePredicateID = predicate_2_id.get("type"); | |||
typePredicateID = predicate_2_id.get("类型"); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
@@ -108,8 +87,7 @@ public class ParaphraseDictionary { | |||
predicate_2_id = new HashMap<String, Integer>(); | |||
id_2_predicate = new HashMap<Integer, String>(); | |||
String input_filename = dbpedia_predicate_id; | |||
File file = new File(input_filename); | |||
File file = new File(predicate_id_path); | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
@@ -118,6 +96,8 @@ public class ParaphraseDictionary { | |||
String line = null; | |||
while ((line = br.readLine())!= null) { | |||
String[] lines = line.split("\t"); | |||
if(lines[0].startsWith("<") && lines[0].endsWith(">")) | |||
lines[0] = lines[0].substring(1, lines[0].length()-1); | |||
predicate_2_id.put(lines[0], Integer.parseInt(lines[1])); | |||
id_2_predicate.put(Integer.parseInt(lines[1]), lines[0]); | |||
} | |||
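// With the angle brackets stripped above, an id-mapping line such as "<类型>\t3" (or the bare
// form "类型\t3") yields predicate_2_id.put("类型", 3); the id 3 here is illustrative only.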
@@ -192,13 +172,10 @@ public class ParaphraseDictionary { | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
String inputFileName = dbpedia_relation_paraphrases_baseform_withScore_rerank; | |||
File file = new File(inputFileName); | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
int lineCount = 0; | |||
//line = br.readLine();//read the first line which indicates the format | |||
while ((line = br.readLine()) != null) | |||
{ | |||
if (line.startsWith("#")) continue; | |||
@@ -259,72 +236,23 @@ public class ParaphraseDictionary { | |||
* A set of very important NL patterns are the predicates themselves! | |||
*/ | |||
public void addPredicateAsNLPattern () { | |||
if(nlPattern_2_predicateList == null) | |||
nlPattern_2_predicateList = new HashMap<String, ArrayList<PredicateIDAndSupport>>(); | |||
final int support = 200; | |||
int predicate_id; | |||
for (String p : predicate_2_id.keySet()) | |||
{ | |||
// TODO: Omitting some bad relations (should be discarded in future) | |||
if(p.equals("state") || p.equals("states")) | |||
continue; | |||
predicate_id = predicate_2_id.get(p); | |||
StringBuilder pattern = new StringBuilder(""); | |||
// Work/runtime 11,SpaceStation/volume 68 and some predicates have prefix (DBpedia 2015), discard the prefix when generating pattern | |||
if(p.contains("/")) | |||
// TODO: segmentation: 1) tokenize 2) single ch-word | |||
String patternString = ""; | |||
List<SegToken> q=EntityRecognitionCh.segmenter.process(p, SegMode.SEARCH); | |||
for (SegToken t:q) | |||
{ | |||
if(p.charAt(0)>='A' && p.charAt(0)<='Z') | |||
p = p.substring(p.indexOf("/")+1); | |||
//gameW/l 1974 | |||
else | |||
p = p.replace("/", ""); | |||
} | |||
int last = 0, i = 0; | |||
for(i = 0; i < p.length(); i ++) { | |||
// if the character is not a lowercase letter, break the word here. | |||
if(!(p.charAt(i)>='a' && p.charAt(i)<='z')) { | |||
pattern.append(p.substring(last, i).toLowerCase()); | |||
pattern.append(" "); | |||
last = i; | |||
} | |||
patternString += t.word + " "; | |||
} | |||
pattern.append(p.substring(last, i).toLowerCase()); | |||
for (i = 3; i < pattern.length(); i ++) { | |||
// the blank between two digits should be deleted. | |||
if (pattern.charAt(i)>='0' && pattern.charAt(i)<='9' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)>='0' && pattern.charAt(i-2)<='9') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
// the blank between I and D should be deleted. | |||
else if (pattern.charAt(i)=='d' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)=='i' | |||
&& pattern.charAt(i-3)==' ') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
// the blank between D and B should be deleted. | |||
else if (pattern.charAt(i)=='b' | |||
&& pattern.charAt(i-1)==' ' | |||
&& pattern.charAt(i-2)=='d' | |||
&& pattern.charAt(i-3)==' ') { | |||
pattern.deleteCharAt(i-1); | |||
} | |||
} | |||
// pattern -> base form | |||
/*String[] ptns = pattern.toString().split(" "); | |||
pattern = new StringBuilder(""); | |||
for (String s : ptns) { | |||
pattern.append(Globals.coreNLPparser.getBaseFormOfPattern(s)); | |||
pattern.append(" "); | |||
} | |||
pattern.deleteCharAt(pattern.length()-1); | |||
String patternString = pattern.toString();*/ | |||
// Special cases cannot use the base form, e.g., foundingYear //TODO: maybe Porter's algorithm | |||
String patternString = Globals.coreNLP.getBaseFormOfPattern(pattern.toString()); | |||
patternString = patternString.trim(); | |||
//System.out.println(p + "-->" + patternString); | |||
if (!nlPattern_2_predicateList.containsKey(patternString)) { | |||
@@ -340,30 +268,39 @@ public class ParaphraseDictionary { | |||
} | |||
public void addHandwriteAsNLPattern() throws IOException { | |||
String inputFileName = dbpedia_relation_paraphrases_handwrite; | |||
InputStreamReader in = null; | |||
BufferedReader br = null; | |||
try{ | |||
File file = new File(inputFileName); | |||
in = new InputStreamReader(new FileInputStream(file), "utf-8"); | |||
in = new InputStreamReader(new FileInputStream(new File(relation_paraphrases_path)), "utf-8"); | |||
br = new BufferedReader(in); | |||
String line = null; | |||
//int lineCount = 0; | |||
//line = br.readLine();//read the first line which indicates the format | |||
while ((line = br.readLine()) != null) { | |||
if (line.startsWith("#") || line.isEmpty()) continue; | |||
//lineCount ++; | |||
String[] content = line.split("\t"); | |||
if(!predicate_2_id.containsKey(content[0])) | |||
continue; | |||
int predicateID = predicate_2_id.get(content[0]); | |||
String nlPattern = content[1].toLowerCase(); | |||
String nlPattern = content[1]; | |||
int support = Integer.parseInt(content[2]); | |||
// Need Segmentation | |||
if(!nlPattern.contains(" ")) | |||
{ | |||
String patternString = ""; | |||
List<SegToken> q=EntityRecognitionCh.segmenter.process(nlPattern, SegMode.SEARCH); | |||
for (SegToken t:q) | |||
{ | |||
patternString += t.word + " "; | |||
} | |||
patternString = patternString.trim(); | |||
nlPattern = patternString; | |||
} | |||
if (!nlPattern_2_predicateList.containsKey(nlPattern)) { | |||
nlPattern_2_predicateList.put(nlPattern, new ArrayList<PredicateIDAndSupport>()); | |||
} | |||
@@ -434,7 +371,7 @@ public class ParaphraseDictionary { | |||
} | |||
public static void main (String[] args) { | |||
Globals.coreNLP = new CoreNLP(); | |||
// Globals.coreNLP = new CoreNLP(); | |||
Globals.pd = new ParaphraseDictionary(); | |||
//Globals.pd.showNLPatterns(); | |||
} | |||
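Note on the new pattern generation: addPredicateAsNLPattern and addHandwriteAsNLPattern now segment Chinese strings with jieba instead of splitting camelCase English predicates. Below is a minimal standalone sketch of that segmentation step, assuming the jieba-analysis dependency already used by EntityRecognitionCh; the predicate string is a hypothetical example, real ones come from predicate_2_id.

import java.util.List;
import com.huaban.analysis.jieba.JiebaSegmenter;
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode;
import com.huaban.analysis.jieba.SegToken;

public class PredicatePatternSketch {
    public static void main(String[] args) {
        JiebaSegmenter segmenter = new JiebaSegmenter();
        String predicate = "毕业院校"; // hypothetical pkubase predicate
        // Segment the predicate and join the tokens with spaces, mirroring addPredicateAsNLPattern.
        StringBuilder pattern = new StringBuilder();
        for (SegToken t : segmenter.process(predicate, SegMode.SEARCH)) {
            pattern.append(t.word).append(" ");
        }
        System.out.println(predicate + " --> " + pattern.toString().trim());
    }
}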
@@ -32,8 +32,8 @@ public class GAnswer { | |||
QueryLogger qlog = null; | |||
try | |||
{ | |||
if (input.length() <= 5) | |||
return null; | |||
// if (input.length() <= 5) | |||
// return null; | |||
System.out.println("[Input:] "+input); | |||
@@ -47,17 +47,17 @@ public class GAnswer { | |||
// Try to solve each NR plan, and combine the ranked SPARQLs. | |||
// We only keep the LOG of the BEST NR plan for convenience. | |||
// Now only 1 plan | |||
for(int i=query.sList.size()-1; i>=0; i--) | |||
{ | |||
Sentence possibleSentence = query.sList.get(i); | |||
qlog.reloadSentence(possibleSentence); | |||
// qlog.isMaltParserUsed = true; | |||
// LOG | |||
System.out.println("transQ: "+qlog.s.plainText); | |||
qlog.NRlog = query.preLog; | |||
// qlog.NRlog = query.preLog; | |||
qlog.SQGlog = "Id: "+query.queryId+"\nQuery: "+query.NLQuestion+"\n"; | |||
qlog.SQGlog += qlog.NRlog; | |||
// qlog.SQGlog += qlog.NRlog; | |||
qlog.timeTable.put("step0", (int)NRtime); | |||
// step 1: question parsing (dependency tree, sentence type) | |||
@@ -91,7 +91,7 @@ public class GAnswer { | |||
qlog.rankedSparqls = rankedSparqls; | |||
System.out.println("number of rankedSparqls = " + qlog.rankedSparqls.size()); | |||
// Detect question focus. | |||
// Detect question focus. TODO: in which cases the question focus != target? | |||
for (int i=0; i<qlog.rankedSparqls.size(); i++) | |||
{ | |||
// First detect by SPARQLs. | |||
@@ -156,7 +156,7 @@ public class GAnswer { | |||
{ | |||
// modified by Lin Yinnian using ghttp - 2018-9-28 | |||
GstoreConnector gc = new GstoreConnector(Globals.QueryEngineIP, Globals.QueryEnginePort); | |||
String answer = gc.query("root", "123456", "dbpedia16", spq.toStringForGStore2()); | |||
String answer = gc.query("endpoint", "123", "pkubase", spq.toStringForGStore2()); | |||
System.out.println(answer); | |||
String[] rawLines = answer.split("\n"); | |||
@@ -199,9 +199,13 @@ public class GAnswer { | |||
int i =1; | |||
//file in/output | |||
List<String> inputList = FileUtil.readFile("E:/Linyinnian/qald6_special.txt"); | |||
List<String> inputList = FileUtil.readFile("data/test/mini-ccks.txt"); | |||
for(String input: inputList) | |||
{ | |||
if (input.length()<2 || input.charAt(0)!='q') continue; | |||
System.out.println("----------------------------------------"); | |||
System.out.println(input); | |||
ArrayList<String> outputs = new ArrayList<String>(); | |||
ArrayList<String> spqs = new ArrayList<String>(); | |||
spqs.add("id:"+String.valueOf(i)); | |||
@@ -220,9 +224,9 @@ public class GAnswer { | |||
System.out.println("Ranked Sparqls: " + qlog.rankedSparqls.size()); | |||
outputs.add(qlog.SQGlog); | |||
outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); | |||
outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); | |||
outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||
// outputs.add(qlog.SQGlog + "Building HQG time: "+ (qlog.timeTable.get("step0")+qlog.timeTable.get("step1")+qlog.timeTable.get("step2")-qlog.timeTable.get("BQG_topkjoin")) + "ms"); | |||
// outputs.add("TopKjoin time: "+ qlog.timeTable.get("BQG_topkjoin") + "ms"); | |||
// outputs.add("Question Understanding time: "+ (int)(parsing_ed_time - parsing_st_time)+ "ms"); | |||
long excuting_st_time = System.currentTimeMillis(); | |||
Matches m = null; | |||
@@ -274,8 +278,10 @@ public class GAnswer { | |||
outputs.add("[" + Math.min(MAX_SPQ_NUM+1, idx) + "]" + "score=" + 1000 + "\n" + stdSPQwoPrefix + "\n"); | |||
} | |||
} | |||
else | |||
outputs.add(""); | |||
FileUtil.writeFile(outputs, "E:/Linyinnian/qald6_special_out.txt", true); | |||
FileUtil.writeFile(outputs, "data/test/mini-ccks.out", true); | |||
} | |||
} | |||
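For reference, a minimal sketch of sending one ranked SPARQL to the new pkubase endpoint through ghttp, reusing the connection parameters shown in GAnswer above; the package name of GstoreConnector and the SPARQL string are assumptions, not taken from this patch.

import jgsc.GstoreConnector; // assumption: gStore's ghttp Java client package

public class PkubaseQuerySketch {
    public static void main(String[] args) {
        // Same endpoint, credentials and database name as used in GAnswer above.
        GstoreConnector gc = new GstoreConnector("pkubase.gstore-pku.com", 80);
        String sparql = "select ?x where { <狄仁杰> <职业> ?x. }"; // hypothetical query
        String answer = gc.query("endpoint", "123", "pkubase", sparql);
        System.out.println(answer);
    }
}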
@@ -8,26 +8,18 @@ import lcn.EntityFragmentFields; | |||
import fgmt.RelationFragment; | |||
import fgmt.TypeFragment; | |||
import paradict.ParaphraseDictionary; | |||
import qa.mapping.DBpediaLookup; | |||
import nlp.tool.NERecognizer; | |||
import nlp.tool.CoreNLP; | |||
import nlp.tool.MaltParser; | |||
import nlp.tool.StanfordParser; | |||
import nlp.tool.StopWordsList; | |||
public class Globals { | |||
// nlp tools | |||
public static CoreNLP coreNLP; | |||
public static StanfordParser stanfordParser; | |||
public static StopWordsList stopWordsList; | |||
public static MaltParser maltParser; | |||
public static NERecognizer nerRecognizer; | |||
// relation paraphrase dictionary | |||
public static ParaphraseDictionary pd; | |||
// entity linking system | |||
public static DBpediaLookup dblk; | |||
public static int MaxAnswerNum = 100; | |||
public static String Dataset = "dbpedia 2016"; | |||
public static String Dataset = "pkubase"; | |||
public static String Version = "0.1.2"; | |||
public static String GDBsystem = "gStore v0.7.2"; | |||
@@ -39,34 +31,25 @@ public class Globals { | |||
public static int evaluationMethod = 2; | |||
public static String localPath = "./././"; | |||
public static String QueryEngineIP = "dbpedia16.gstore-pku.com"; // Notice, PORT number is in the evaluation function. | |||
public static String QueryEngineIP = "pkubase.gstore-pku.com"; // Notice, PORT number is in the evaluation function. | |||
public static int QueryEnginePort = 80; | |||
public static void init () | |||
{ | |||
System.out.println("====== gAnswer2.0 over DBpedia ======"); | |||
System.out.println("====== gAnswer2.0 over Pkubase ======"); | |||
long t1, t2, t3, t4, t5, t6, t7, t8, t9; | |||
t1 = System.currentTimeMillis(); | |||
coreNLP = new CoreNLP(); | |||
t2 = System.currentTimeMillis(); | |||
stanfordParser = new StanfordParser(); | |||
t3 = System.currentTimeMillis(); | |||
maltParser = new MaltParser(); | |||
t4 = System.currentTimeMillis(); | |||
nerRecognizer = new NERecognizer(); | |||
t5 = System.currentTimeMillis(); | |||
t2 = System.currentTimeMillis(); | |||
stopWordsList = new StopWordsList(); | |||
t6 = System.currentTimeMillis(); | |||
t3 = System.currentTimeMillis(); | |||
pd = new ParaphraseDictionary(); | |||
t7 = System.currentTimeMillis(); | |||
t4 = System.currentTimeMillis(); | |||
try | |||
{ | |||
EntityFragmentFields.load(); | |||
@@ -78,20 +61,13 @@ public class Globals { | |||
e1.printStackTrace(); | |||
} | |||
t8 = System.currentTimeMillis(); | |||
dblk = new DBpediaLookup(); | |||
t9 = System.currentTimeMillis(); | |||
t5 = System.currentTimeMillis(); | |||
System.out.println("======Initialization======"); | |||
System.out.println("CoreNLP(Lemma): " + (t2-t1) + "ms."); | |||
System.out.println("StanfordParser: " + (t3-t2) + "ms."); | |||
System.out.println("MaltParser: " + (t4-t3) + "ms."); | |||
System.out.println("NERecognizer: " + (t5-t4) + "ms."); | |||
System.out.println("StopWordsList: " + (t6-t5) + "ms."); | |||
System.out.println("ParaphraseDict & posTagPattern: " + (t7-t6) + "ms."); | |||
System.out.println("GraphFragments: " + (t8-t7) + "ms."); | |||
System.out.println("DBpediaLookup: " + (t9-t8) + "ms."); | |||
System.out.println("* Total *: " + (t9-t1) + "ms."); | |||
System.out.println("StanfordParser: " + (t2-t1) + "ms."); | |||
System.out.println("StopWordsList: " + (t3-t2) + "ms."); | |||
System.out.println("ParaphraseDict: " + (t4-t3) + "ms."); | |||
System.out.println("GraphFragments: " + (t5-t4) + "ms."); | |||
System.out.println("* Total *: " + (t5-t1) + "ms."); | |||
System.out.println("=========================="); | |||
} | |||
@@ -1,10 +1,11 @@ | |||
package qa; | |||
import java.util.ArrayList; | |||
import java.util.List; | |||
import nlp.ds.Sentence; | |||
import qa.extract.EntityRecognition; | |||
import rdf.MergedWord; | |||
import nlp.ds.Word; | |||
import qa.extract.EntityRecognitionCh; | |||
/** | |||
* 1. preprocessing of question | |||
@@ -21,7 +22,7 @@ public class Query | |||
public String queryId = null; | |||
public String preLog = ""; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public List<Word> words = null; | |||
public Query(){} | |||
public Query(String _question) | |||
@@ -32,15 +33,17 @@ public class Query | |||
TransferedQuestion = getTransferedQuestion(NLQuestion); | |||
// step1. NODE Recognition | |||
MergedQuestionList = getMergedQuestionList(TransferedQuestion); | |||
// MergedQuestionList = getMergedQuestionList(TransferedQuestion); | |||
words = EntityRecognitionCh.parseSentAndRecogEnt(TransferedQuestion); | |||
// build Sentence | |||
sList = new ArrayList<Sentence>(); | |||
for(String mergedQuestion: MergedQuestionList) | |||
{ | |||
Sentence sentence = new Sentence(this, mergedQuestion); | |||
sList.add(sentence); | |||
} | |||
sList.add(new Sentence(words, TransferedQuestion)); // TODO: TransferedQuestion or _question | |||
// for(String mergedQuestion: MergedQuestionList) | |||
// { | |||
// Sentence sentence = new Sentence(this, mergedQuestion); | |||
// sList.add(sentence); | |||
// } | |||
} | |||
public boolean isDigit(char ch) | |||
@@ -66,6 +69,14 @@ public class Query | |||
*/ | |||
public String getTransferedQuestion(String question) | |||
{ | |||
//discard a trailing ?, 。 or ! | |||
if(question.endsWith("?") || question.endsWith("。") || question.endsWith("!")) | |||
question = question.substring(0, question.length()-1); | |||
//discard 《》 because the Stanford parser DOES NOT recognize them. TODO: why? | |||
question = question.replace("《", "").replace("》", ""); | |||
question = question.replace("“", "").replace("”", ""); // now just discard "" because they confuse the parser. | |||
//rule1: discard ".", because "." and "_" will be disconnected by parser. Discard word tail's "'", which may pollutes NER | |||
question = question.replace("' ", " "); | |||
String [] words = question.split(" "); | |||
@@ -84,45 +95,31 @@ public class Query | |||
ret = ret.substring(0,ret.length()-1); | |||
ret = ret.replace("-", " "); | |||
ret = ret.replace("in america", "in United States"); | |||
//rule2: as well as -> and | |||
ret = ret.replace("as well as", "and"); | |||
//rule3: movie -> film | |||
ret = ret.replace(" movie", " film"); | |||
ret = ret.replace(" movies", " films"); | |||
return ret; | |||
} | |||
/** | |||
* Recognize entity & type & literal in KB and replace " " in Phrases with "_" | |||
* @param question | |||
* @return merged question list | |||
*/ | |||
public ArrayList<String> getMergedQuestionList(String question) | |||
{ | |||
ArrayList<String> mergedQuestionList = null; | |||
//entity & type recognize | |||
EntityRecognition er = new EntityRecognition(); | |||
mergedQuestionList = er.process(question); | |||
preLog = er.preLog; | |||
mWordList = er.mWordList; | |||
return mergedQuestionList; | |||
} | |||
public String removeQueryId(String question) | |||
{ | |||
String ret = question; | |||
// case 1: 1\t | |||
int st = question.indexOf("\t"); | |||
if(st!=-1 && question.length()>1 && question.charAt(0)>='0' && question.charAt(0)<='9') | |||
if(st!=-1 && question.length()>4 && isDigit(question.charAt(0))) | |||
{ | |||
queryId = question.substring(0,st); | |||
ret = question.substring(st+1); | |||
System.out.println("Extract QueryId :"+queryId); | |||
} | |||
// case 2: q1: | 1: | |||
st = question.indexOf(":"); | |||
if(st!=-1 && st<6 && question.length()>4 && (isDigit(question.charAt(0)) ||question.startsWith("q"))) | |||
{ | |||
queryId = question.substring(0,st).replace("q", ""); | |||
ret = question.substring(st+1); | |||
System.out.println("Extract QueryId :"+queryId); | |||
} | |||
return ret; | |||
} | |||
} |
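A small usage sketch of removeQueryId, covering the two id formats it strips ("1\t..." and "q1:..."); the sample questions are hypothetical.

public class RemoveQueryIdSketch {
    public static void main(String[] args) {
        qa.Query query = new qa.Query();
        // case 1: tab-separated numeric id
        System.out.println(query.removeQueryId("3\t狄仁杰是哪个朝代的人"));
        // case 2: "q<id>:" prefix
        System.out.println(query.removeQueryId("q1:狄仁杰是哪个朝代的人"));
    }
}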
@@ -1,864 +0,0 @@ | |||
package qa.extract; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.Comparator; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import fgmt.EntityFragment; | |||
import nlp.ds.Word; | |||
import qa.Globals; | |||
import rdf.EntityMapping; | |||
import rdf.NodeSelectedWithScore; | |||
import rdf.TypeMapping; | |||
import rdf.MergedWord; | |||
import utils.FileUtil; | |||
import addition.*; | |||
/** | |||
* Core class of Node Recognition | |||
* @author husen | |||
*/ | |||
public class EntityRecognition { | |||
public String preLog = ""; | |||
public String stopEntFilePath = Globals.localPath + "data/DBpedia2016/parapharse/stopEntDict.txt"; | |||
double EntAcceptedScore = 26; | |||
double TypeAcceptedScore = 0.5; | |||
double AcceptedDiffScore = 1; | |||
public ArrayList<MergedWord> mWordList = null; | |||
public ArrayList<String> stopEntList = null; | |||
public ArrayList<String> badTagListForEntAndType = null; | |||
ArrayList<ArrayList<Integer>> selectedList = null; | |||
TypeRecognition tr = null; | |||
AddtionalFix af = null; | |||
public EntityRecognition() | |||
{ | |||
// LOG | |||
preLog = ""; | |||
loadStopEntityDict(); | |||
// Bad posTag for entity | |||
badTagListForEntAndType = new ArrayList<String>(); | |||
badTagListForEntAndType.add("RBS"); | |||
badTagListForEntAndType.add("JJS"); | |||
badTagListForEntAndType.add("W"); | |||
badTagListForEntAndType.add("."); | |||
badTagListForEntAndType.add("VBD"); | |||
badTagListForEntAndType.add("VBN"); | |||
badTagListForEntAndType.add("VBZ"); | |||
badTagListForEntAndType.add("VBP"); | |||
badTagListForEntAndType.add("POS"); | |||
// Additional fix for CATEGORY (in DBpedia) | |||
af = new AddtionalFix(); | |||
tr = new TypeRecognition(); | |||
System.out.println("EntityRecognizer Initial : ok!"); | |||
} | |||
public void loadStopEntityDict() | |||
{ | |||
stopEntList = new ArrayList<String>(); | |||
try | |||
{ | |||
List<String> inputs = FileUtil.readFile(stopEntFilePath); | |||
for(String line: inputs) | |||
{ | |||
if(line.startsWith("#")) | |||
continue; | |||
stopEntList.add(line); | |||
} | |||
} | |||
catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
public ArrayList<String> process(String question) | |||
{ | |||
ArrayList<String> fixedQuestionList = new ArrayList<String>(); | |||
ArrayList<Integer> literalList = new ArrayList<Integer>(); | |||
HashMap<Integer, Double> entityScores = new HashMap<Integer, Double>(); | |||
HashMap<Integer, Integer> entityMappings = new HashMap<Integer, Integer>(); | |||
HashMap<Integer, Double> typeScores = new HashMap<Integer, Double>(); | |||
HashMap<Integer, String> typeMappings = new HashMap<Integer, String>(); | |||
HashMap<Integer, Double> mappingScores = new HashMap<Integer, Double>(); | |||
ArrayList<Integer> mustSelectedList = new ArrayList<Integer>(); | |||
System.out.println("--------- entity/type recognition start ---------"); | |||
Word[] words = Globals.coreNLP.getTaggedWords(question); | |||
mWordList = new ArrayList<MergedWord>(); | |||
long t1 = System.currentTimeMillis(); | |||
int checkEntCnt = 0, checkTypeCnt = 0, hitEntCnt = 0, hitTypeCnt = 0, allCnt = 0; | |||
boolean needRemoveCommas = false; | |||
// Check entity & type | |||
// Notice, ascending order by length | |||
StringBuilder tmpOW = new StringBuilder(); | |||
StringBuilder tmpBW = new StringBuilder(); | |||
for(int len=1; len<=words.length; len++) | |||
{ | |||
for(int st=0,ed=st+len; ed<=words.length; st++,ed++) | |||
{ | |||
String originalWord = "", baseWord = "", allUpperWord = ""; | |||
//String[] posTagArr = new String[len]; | |||
for(int j=st; j<ed; j++) | |||
{ | |||
//posTagArr[j-st] = words[j].posTag; | |||
//originalWord += words[j].originalForm; | |||
//baseWord += words[j].baseForm; | |||
tmpOW.append(words[j].originalForm); | |||
tmpBW.append(words[j].baseForm); | |||
String tmp = words[j].originalForm; | |||
if(tmp.length()>0 && tmp.charAt(0) >='a' && tmp.charAt(0)<='z') | |||
{ | |||
String pre = tmp.substring(0,1).toUpperCase(); | |||
tmp = pre + tmp.substring(1); | |||
} | |||
allUpperWord += tmp; | |||
if(j < ed-1) | |||
{ | |||
//originalWord += "_"; | |||
//baseWord += "_"; | |||
tmpOW.append("_"); | |||
tmpBW.append("_"); | |||
} | |||
} | |||
originalWord = tmpOW.toString(); | |||
baseWord=tmpBW.toString(); | |||
tmpOW.setLength(0); | |||
tmpBW.setLength(0); | |||
allCnt++; | |||
/* | |||
* Filters to speed up and drop some bad cases. | |||
*/ | |||
boolean entOmit = false, typeOmit = false; | |||
int prep_cnt=0; | |||
// Capitalized words can pass the filter, e.g., "Melbourne , Florida" | |||
int UpperWordCnt = 0; | |||
for(int i=st;i<ed;i++) | |||
if((words[i].originalForm.charAt(0)>='A' && words[i].originalForm.charAt(0)<='Z') | |||
|| ((words[i].posTag.equals(",") || words[i].originalForm.equals("'")) && i>st && i<ed-1)) | |||
UpperWordCnt++; | |||
// Filters | |||
if(UpperWordCnt<len || st==0) | |||
{ | |||
if(st==0) | |||
{ | |||
if(!words[st].posTag.startsWith("DT") && !words[st].posTag.startsWith("N")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
else if(st>0) | |||
{ | |||
Word formerWord = words[st-1]; | |||
//as princess | |||
if(formerWord.baseForm.equals("as")) | |||
entOmit = true; | |||
//how many dogs? | |||
if(formerWord.baseForm.equals("many")) | |||
entOmit = true; | |||
//obama's daughter ; your height | len=1 to avoid: Asimov's Foundation series | |||
if(len == 1 && (formerWord.posTag.startsWith("POS") || formerWord.posTag.startsWith("PRP"))) | |||
entOmit = true; | |||
//the father of you | |||
if(ed<words.length) | |||
{ | |||
Word nextWord = words[ed]; | |||
if(formerWord.posTag.equals("DT") && nextWord.posTag.equals("IN")) | |||
entOmit = true; | |||
} | |||
//the area code of ; the official language of | |||
boolean flag1=false, flag2=false; | |||
for(int i=0;i<=st;i++) | |||
if(words[i].posTag.equals("DT")) | |||
flag1 = true; | |||
for(int i=ed-1;i<words.length;i++) | |||
if(words[i].posTag.equals("IN")) | |||
flag2 = true; | |||
if(flag1 && flag2) | |||
entOmit = true; | |||
} | |||
if(ed < words.length) | |||
{ | |||
Word nextWord = words[ed]; | |||
// (lowerCase)+(UpperCase) | |||
if(nextWord.originalForm.charAt(0)>='A' && nextWord.originalForm.charAt(0)<='Z') | |||
entOmit = true; | |||
} | |||
for(int i=st;i<ed;i++) | |||
{ | |||
if(words[i].posTag.startsWith("I")) | |||
prep_cnt++; | |||
for(String badTag: badTagListForEntAndType) | |||
{ | |||
if(words[i].posTag.startsWith(badTag)) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
break; | |||
} | |||
} | |||
if(words[i].posTag.startsWith("P") && (i!=ed-1 || len==1)){ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
// First word | |||
if(i==st) | |||
{ | |||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("EX") || words[i].posTag.startsWith("TO")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].posTag.startsWith("D") && len==2){ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.startsWith("list") || words[i].baseForm.startsWith("many")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.equals("and")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
// Last word. | |||
if(i==ed-1) | |||
{ | |||
if(words[i].posTag.startsWith("I") || words[i].posTag.startsWith("D") || words[i].posTag.startsWith("TO")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
if(words[i].baseForm.equals("and")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
// Single word. | |||
if(len==1) | |||
{ | |||
//TODO: Omit general noun. eg: father, book ... | |||
if(!words[i].posTag.startsWith("N")) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
} | |||
// Too many prepositions. | |||
if(prep_cnt >= 3) | |||
{ | |||
entOmit = true; | |||
typeOmit = true; | |||
} | |||
} | |||
/* | |||
* Filter done. | |||
*/ | |||
// Search category | highest priority | |||
String category = null; | |||
if(af.pattern2category.containsKey(baseWord)) | |||
{ | |||
typeOmit = true; | |||
entOmit = true; | |||
category = af.pattern2category.get(baseWord); | |||
} | |||
// Search type | |||
int hitMethod = 0; // 1=dbo(baseWord), 2=dbo(originalWord), 3=yago|extend() | |||
ArrayList<TypeMapping> tmList = new ArrayList<TypeMapping>(); | |||
if(!typeOmit) | |||
{ | |||
System.out.println("Type Check: "+originalWord); | |||
//checkTypeCnt++; | |||
//search standard type | |||
tmList = tr.getTypeIDsAndNamesByStr(baseWord); | |||
if(tmList == null || tmList.size() == 0) | |||
{ | |||
tmList = tr.getTypeIDsAndNamesByStr(originalWord); | |||
if(tmList != null && tmList.size()>0) | |||
hitMethod = 2; | |||
} | |||
else | |||
hitMethod = 1; | |||
//Search extend type (YAGO type) | |||
if(tmList == null || tmList.size() == 0) | |||
{ | |||
tmList = tr.getExtendTypeByStr(allUpperWord); | |||
if(tmList != null && tmList.size() > 0) | |||
{ | |||
preLog += "++++ Extend Type detect: "+baseWord+": "+" prefferd relaiton:"+tmList.get(0).prefferdRelation+"\n"; | |||
hitMethod = 3; | |||
} | |||
} | |||
} | |||
// Search entity | |||
ArrayList<EntityMapping> emList = new ArrayList<EntityMapping>(); | |||
if(!entOmit && !stopEntList.contains(baseWord)) | |||
{ | |||
System.out.println("Ent Check: "+originalWord); | |||
checkEntCnt++; | |||
// Notice, the second parameter indicates whether to use DBpedia Lookup. | |||
emList = getEntityIDsAndNamesByStr(originalWord, (UpperWordCnt>=len-1 || len==1),len); | |||
if(emList == null || emList.size() == 0) | |||
{ | |||
emList = getEntityIDsAndNamesByStr(baseWord, (UpperWordCnt>=len-1 || len==1), len); | |||
} | |||
if(emList!=null && emList.size()>10) | |||
{ | |||
ArrayList<EntityMapping> tmpList = new ArrayList<EntityMapping>(); | |||
for(int i=0;i<10;i++) | |||
{ | |||
tmpList.add(emList.get(i)); | |||
} | |||
emList = tmpList; | |||
} | |||
} | |||
MergedWord mWord = new MergedWord(st,ed,originalWord); | |||
// Add category | |||
if(category != null) | |||
{ | |||
mWord.mayCategory = true; | |||
mWord.category = category; | |||
int key = st*(words.length+1) + ed; | |||
mustSelectedList.add(key); | |||
} | |||
// Add literal | |||
if(len==1 && checkLiteralWord(words[st])) | |||
{ | |||
mWord.mayLiteral = true; | |||
int key = st*(words.length+1) + ed; | |||
literalList.add(key); | |||
} | |||
// Add type mappings | |||
if(tmList!=null && tmList.size()>0) | |||
{ | |||
// Drop by score threshold | |||
if(tmList.get(0).score < TypeAcceptedScore) | |||
typeOmit = true; | |||
// Only allow EXACT MATCH when method=1|2 | |||
// TODO: consider approximate match and taxonomy. eg, actor->person | |||
String likelyType = tmList.get(0).typeName.toLowerCase(); | |||
String candidateBase = baseWord.replace("_", ""), candidateOriginal = originalWord.replace("_", "").toLowerCase(); | |||
if(!candidateBase.equals(likelyType) && hitMethod == 1) | |||
typeOmit = true; | |||
if(!candidateOriginal.equals(likelyType) && hitMethod == 2) | |||
typeOmit = true; | |||
if(!typeOmit) | |||
{ | |||
mWord.mayType = true; | |||
mWord.tmList = tmList; | |||
int key = st*(words.length+1) + ed; | |||
typeMappings.put(key, tmList.get(0).typeName); | |||
typeScores.put(key, tmList.get(0).score); | |||
} | |||
} | |||
// Add entity mappings | |||
if(emList!=null && emList.size()>0) | |||
{ | |||
// Drop by score threshold | |||
if(emList.get(0).score < EntAcceptedScore) | |||
entOmit = true; | |||
// Drop: the [German Shepherd] dog | |||
else if(len > 2) | |||
{ | |||
for(int key: entityMappings.keySet()) | |||
{ | |||
//int te=key%(words.length+1); | |||
int ts=key/(words.length+1); | |||
if(ts == st+1 && ts <= ed) | |||
{ | |||
//DT in lowercase (allow uppercase, such as: [The Pillars of the Earth]) | |||
if(words[st].posTag.startsWith("DT") && !(words[st].originalForm.charAt(0)>='A'&&words[st].originalForm.charAt(0)<='Z')) | |||
{ | |||
entOmit = true; | |||
} | |||
} | |||
} | |||
} | |||
// Record info in merged word | |||
if(!entOmit) | |||
{ | |||
mWord.mayEnt = true; | |||
mWord.emList = emList; | |||
// used to remove duplicates and to select | |||
int key = st*(words.length+1) + ed; | |||
entityMappings.put(key, emList.get(0).entityID); | |||
// fix entity score | conflict resolution | |||
double score = emList.get(0).score; | |||
String likelyEnt = emList.get(0).entityName.toLowerCase().replace(" ", "_"); | |||
String lowerOriginalWord = originalWord.toLowerCase(); | |||
// !Award: whole match | |||
if(likelyEnt.equals(lowerOriginalWord)) | |||
score *= len; | |||
// !Award: COVER (eg, Robert Kennedy: [Robert] [Kennedy] [Robert Kennedy]) | |||
//e.g, Social_Democratic_Party -> all ents -> drop the overlapped smaller ones | |||
//e.g, Abraham_Lincoln -> select the whole word | |||
if(len>1) | |||
{ | |||
boolean[] flag = new boolean[words.length+1]; | |||
ArrayList<Integer> needlessEntList = new ArrayList<Integer>(); | |||
double tmpScore=0; | |||
for(int preKey: entityMappings.keySet()) | |||
{ | |||
if(preKey == key) | |||
continue; | |||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||
for(int i=ts;i<te;i++) | |||
flag[i] = true; | |||
if(st<=ts && ed>= te) | |||
{ | |||
needlessEntList.add(preKey); | |||
tmpScore += entityScores.get(preKey); | |||
} | |||
} | |||
int hitCnt = 0; | |||
for(int i=st;i<ed;i++) | |||
if(flag[i]) | |||
hitCnt++; | |||
// WHOLE match || HIGH match & HIGH upper || WHOLE upper | |||
if(hitCnt == len || ((double)hitCnt/(double)len > 0.6 && (double)UpperWordCnt/(double)len > 0.6) || UpperWordCnt == len || len>=4) | |||
{ | |||
boolean commaTotalRight = true; | |||
if(originalWord.contains(",")) | |||
{ | |||
String candidateCompactString = originalWord.replace(",","").replace("_", "").toLowerCase(); | |||
String likelyCompactEnt = likelyEnt.replace(",","").replace("_", ""); | |||
if(!candidateCompactString.equals(likelyCompactEnt)) | |||
commaTotalRight = false; | |||
else | |||
{ | |||
mWord.name = mWord.name.replace("_,_","_"); | |||
needRemoveCommas = true; | |||
} | |||
} | |||
if(commaTotalRight) | |||
{ | |||
mustSelectedList.add(key); | |||
if(tmpScore>score) | |||
score = tmpScore+1; | |||
for(int preKey: needlessEntList) | |||
{ | |||
entityMappings.remove(preKey); | |||
mustSelectedList.remove(Integer.valueOf(preKey)); | |||
} | |||
} | |||
} | |||
} | |||
//NOTICE: the score in mWord does not change; we only change the score in entityScores. | |||
entityScores.put(key,score); | |||
} | |||
} | |||
if(mWord.mayCategory || mWord.mayEnt || mWord.mayType || mWord.mayLiteral) | |||
mWordList.add(mWord); | |||
} | |||
} | |||
/* Print all candidates (use fixed score).*/ | |||
System.out.println("------- Result ------"); | |||
for(MergedWord mWord: mWordList) | |||
{ | |||
int key = mWord.st * (words.length+1) + mWord.ed; | |||
if(mWord.mayCategory) | |||
{ | |||
System.out.println("Detect category mapping: "+mWord.name+": "+ mWord.category +" score: 100.0"); | |||
preLog += "++++ Category detect: "+mWord.name+": "+mWord.category+" score: 100.0\n"; | |||
} | |||
if(mWord.mayEnt) | |||
{ | |||
System.out.println("Detect entity mapping: "+mWord.name+": ["); | |||
for(EntityMapping em: mWord.emList) | |||
System.out.print(em.entityName + ", "); | |||
System.out.println("]"); | |||
preLog += "++++ Entity detect: "+mWord.name+": "+mWord.emList.get(0).entityName+" score:"+entityScores.get(key)+"\n"; | |||
hitEntCnt++; | |||
} | |||
if(mWord.mayType) | |||
{ | |||
System.out.println("Detect type mapping: "+mWord.name+": ["); | |||
for(TypeMapping tm: mWord.tmList) | |||
System.out.print(tm.typeName + ", "); | |||
System.out.println("]"); | |||
preLog += "++++ Type detect: "+mWord.name+": "+mWord.tmList.get(0).typeName +" score:"+typeScores.get(key)+"\n"; | |||
hitTypeCnt++; | |||
} | |||
if(mWord.mayLiteral) | |||
{ | |||
System.out.println("Detect literal: "+mWord.name); | |||
preLog += "++++ Literal detect: "+mWord.name+"\n"; | |||
} | |||
} | |||
/* | |||
* Sort by score and remove duplicates. | |||
* e.g., <"video_game" "ent:Video game" "50.0"> <"a_video_game" "ent:Video game" "45.0">. | |||
* Notice, all information is preserved in mWordList. | |||
*/ | |||
// one ENT may map to different merged words in the query; keep the one with the higher score. | |||
ByValueComparator bvc = new ByValueComparator(entityScores,words.length+1); | |||
List<Integer> keys = new ArrayList<Integer>(entityMappings.keySet()); | |||
Collections.sort(keys, bvc); | |||
for(Integer key : keys) | |||
{ | |||
if(!mappingScores.containsKey(entityMappings.get(key))) | |||
mappingScores.put(entityMappings.get(key), entityScores.get(key)); | |||
else | |||
entityMappings.remove(key); | |||
} | |||
selectedList = new ArrayList<ArrayList<Integer>>(); | |||
ArrayList<Integer> selected = new ArrayList<Integer>(); | |||
// Some phrases must be selected. | |||
selected.addAll(mustSelectedList); | |||
for(Integer key: typeMappings.keySet()) | |||
{ | |||
// !type(len>1) (Omit len=1 because, e.g., [Brooklyn Bridge] is an entity.) | |||
int ed = key%(words.length+1), st = key/(words.length+1); | |||
if(st+1 < ed) | |||
{ | |||
boolean beCovered = false; | |||
//Entity covers type, e.g., [prime_minister of Spain] | |||
for(int preKey: entityMappings.keySet()) | |||
{ | |||
int te=preKey%(words.length+1),ts=preKey/(words.length+1); | |||
//Entity should be longer than the type | |||
if(ts <= st && te >= ed && ed-st < te-ts) | |||
{ | |||
beCovered = true; | |||
} | |||
} | |||
if(!beCovered) | |||
selected.add(key); | |||
} | |||
} | |||
// Conflict resolution | |||
ArrayList<Integer> noConflictSelected = new ArrayList<Integer>(); | |||
//select the longer one when spans conflict | |||
boolean[] flag = new boolean[words.length]; | |||
ByLenComparator blc = new ByLenComparator(words.length+1); | |||
Collections.sort(selected,blc); | |||
for(Integer key : selected) | |||
{ | |||
int ed = key%(words.length+1), st = (key-ed)/(words.length+1); | |||
boolean omit = false; | |||
for(int i=st;i<ed;i++) | |||
{ | |||
if(flag[i]) | |||
{ | |||
omit = true; | |||
break; | |||
} | |||
} | |||
if(omit) | |||
continue; | |||
for(int i=st;i<ed;i++) | |||
flag[i]=true; | |||
noConflictSelected.add(key); | |||
} | |||
// Scoring and ranking --> top-k decision | |||
dfs(keys,0,noConflictSelected,words.length+1); | |||
ArrayList<NodeSelectedWithScore> nodeSelectedWithScoreList = new ArrayList<NodeSelectedWithScore>(); | |||
for(ArrayList<Integer> select: selectedList) | |||
{ | |||
double score = 0; | |||
for(Integer key: select) | |||
{ | |||
if(entityScores.containsKey(key)) | |||
score += entityScores.get(key); | |||
if(typeScores.containsKey(key)) | |||
score += typeScores.get(key); | |||
} | |||
NodeSelectedWithScore tmp = new NodeSelectedWithScore(select, score); | |||
nodeSelectedWithScoreList.add(tmp); | |||
} | |||
Collections.sort(nodeSelectedWithScoreList); | |||
// Replace | |||
int cnt = 0; | |||
for(int k=0; k<nodeSelectedWithScoreList.size(); k++) | |||
{ | |||
if(k >= nodeSelectedWithScoreList.size()) | |||
break; | |||
selected = nodeSelectedWithScoreList.get(k).selected; | |||
Collections.sort(selected); | |||
int j = 0; | |||
String res = question; | |||
if(selected.size()>0) | |||
{ | |||
res = words[0].originalForm; | |||
int tmp = selected.get(j++), st = tmp/(words.length+1), ed = tmp%(words.length+1); | |||
for(int i=1;i<words.length;i++) | |||
{ | |||
if(i>st && i<ed) | |||
{ | |||
res = res+"_"+words[i].originalForm; | |||
} | |||
else | |||
{ | |||
res = res+" "+words[i].originalForm; | |||
} | |||
if(i >= ed && j<selected.size()) | |||
{ | |||
tmp = selected.get(j++); | |||
st = tmp/(words.length+1); | |||
ed = tmp%(words.length+1); | |||
} | |||
} | |||
} | |||
else | |||
{ | |||
res = words[0].originalForm; | |||
for(int i=1;i<words.length;i++) | |||
{ | |||
res = res+" "+words[i].originalForm; | |||
} | |||
} | |||
boolean ok = true; | |||
for(String str: fixedQuestionList) | |||
if(str.equals(res)) | |||
ok = false; | |||
if(!ok) | |||
continue; | |||
if(needRemoveCommas) | |||
res = res.replace("_,_","_"); | |||
System.out.println("Merged: "+res); | |||
preLog += "plan "+cnt+": "+res+"\n"; | |||
fixedQuestionList.add(res); | |||
cnt++; | |||
if(cnt >= 3) // top-3 | |||
break; | |||
} | |||
long t2 = System.currentTimeMillis(); | |||
// preLog += "Total hit/check/all ent num: "+hitEntCnt+" / "+checkEntCnt+" / "+allCnt+"\n"; | |||
// preLog += "Total hit/check/all type num: "+hitTypeCnt+" / "+checkTypeCnt+" / "+allCnt+"\n"; | |||
preLog += "Node Recognition time: "+ (t2-t1) + "ms\n"; | |||
System.out.println("Total check time: "+ (t2-t1) + "ms"); | |||
System.out.println("--------- pre entity/type recognition end ---------"); | |||
return fixedQuestionList; | |||
} | |||
public void dfs(List<Integer> keys,int dep,ArrayList<Integer> selected,int size) | |||
{ | |||
if(dep == keys.size()) | |||
{ | |||
ArrayList<Integer> tmpList = (ArrayList<Integer>) selected.clone(); | |||
selectedList.add(tmpList); | |||
} | |||
else | |||
{ | |||
//off: dep-th mWord | |||
dfs(keys,dep+1,selected,size); | |||
//on: no conflict | |||
boolean conflict = false; | |||
for(int preKey: selected) | |||
{ | |||
int curKey = keys.get(dep); | |||
int preEd = preKey%size, preSt = (preKey-preEd)/size; | |||
int curEd = curKey%size, curSt = (curKey-curEd)/size; | |||
if(!(preSt<preEd && preEd<=curSt && curSt<curEd) && !(curSt<curEd && curEd<=preSt && preSt<preEd)) | |||
conflict = true; | |||
} | |||
if(!conflict) | |||
{ | |||
selected.add(keys.get(dep)); | |||
dfs(keys,dep+1,selected,size); | |||
selected.remove(keys.get(dep)); | |||
} | |||
} | |||
} | |||
public ArrayList<EntityMapping> getEntityIDsAndNamesByStr(String entity, boolean useDblk, int len) | |||
{ | |||
String n = entity; | |||
ArrayList<EntityMapping> ret= new ArrayList<EntityMapping>(); | |||
//1. Lucene index | |||
ret.addAll(EntityFragment.getEntityMappingList(n)); | |||
//2. DBpedia Lookup (some cases) | |||
if (useDblk) | |||
{ | |||
ret.addAll(Globals.dblk.getEntityMappings(n, null)); | |||
} | |||
Collections.sort(ret); | |||
if (ret.size() > 0) return ret; | |||
else return null; | |||
} | |||
public int preferDBpediaLookupOrLucene(String entityName) | |||
{ | |||
int cntUpperCase = 0; | |||
int cntSpace = 0; | |||
int cntPoint = 0; | |||
int length = entityName.length(); | |||
for (int i=0; i<length; i++) | |||
{ | |||
char c = entityName.charAt(i); | |||
if (c==' ') | |||
cntSpace++; | |||
else if (c=='.') | |||
cntPoint++; | |||
else if (c>='A' && c<='Z') | |||
cntUpperCase++; | |||
} | |||
if ((cntUpperCase>0 || cntPoint>0) && cntSpace<3) | |||
return 1; | |||
if (cntUpperCase == length) | |||
return 1; | |||
return 0; | |||
} | |||
static class ByValueComparator implements Comparator<Integer> { | |||
HashMap<Integer, Double> base_map; | |||
int base_size; | |||
double eps = 1e-8; | |||
int dblcmp(double a,double b) | |||
{ | |||
if(a+eps < b) | |||
return -1; | |||
return b+eps<a ? 1:0; | |||
} | |||
public ByValueComparator(HashMap<Integer, Double> base_map, Integer size) { | |||
this.base_map = base_map; | |||
this.base_size = size; | |||
} | |||
public int compare(Integer arg0, Integer arg1) { | |||
if (!base_map.containsKey(arg0) || !base_map.containsKey(arg1)) { | |||
return 0; | |||
} | |||
if (dblcmp(base_map.get(arg0),base_map.get(arg1))<0) { | |||
return 1; | |||
} | |||
else if (dblcmp(base_map.get(arg0),base_map.get(arg1))==0) | |||
{ | |||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||
if (len0 < len1) { | |||
return 1; | |||
} else if (len0 == len1) { | |||
return 0; | |||
} else { | |||
return -1; | |||
} | |||
} | |||
else { | |||
return -1; | |||
} | |||
} | |||
} | |||
static class ByLenComparator implements Comparator<Integer> { | |||
int base_size; | |||
public ByLenComparator(int size) { | |||
this.base_size = size; | |||
} | |||
public int compare(Integer arg0, Integer arg1) { | |||
int len0 = (arg0%base_size)-arg0/base_size , len1 = (arg1%base_size)-arg1/base_size; | |||
if (len0 < len1) { | |||
return 1; | |||
} else if (len0 == len1) { | |||
return 0; | |||
} else { | |||
return -1; | |||
} | |||
} | |||
} | |||
public boolean isDigit(char ch) | |||
{ | |||
if(ch>='0' && ch<='9') | |||
return true; | |||
return false; | |||
} | |||
//TODO: other literal words. | |||
public boolean checkLiteralWord(Word word) | |||
{ | |||
boolean ok = false; | |||
if(word.posTag.equals("CD")) | |||
ok = true; | |||
return ok; | |||
} | |||
public static void main (String[] args) | |||
{ | |||
Globals.init(); | |||
EntityRecognition er = new EntityRecognition(); | |||
try | |||
{ | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
while (true) | |||
{ | |||
System.out.println("Please input the question: "); | |||
String question = br.readLine(); | |||
er.process(question); | |||
} | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} |
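Implementation note on the (now deleted) English EntityRecognition above: every candidate phrase covering words [st, ed) is packed into one integer key, st*(words.length+1)+ed, and decoded back with % and /. A minimal standalone sketch of that encoding, using a hypothetical sentence length:

public class SpanKeySketch {
    public static void main(String[] args) {
        int sentenceLen = 7;           // hypothetical number of words in the question
        int base = sentenceLen + 1;    // words.length + 1, as in EntityRecognition
        int st = 2, ed = 5;            // candidate phrase covers words 2, 3 and 4
        int key = st * base + ed;      // encode the span into a single int
        int decodedEd = key % base;    // recover the end index
        int decodedSt = key / base;    // recover the start index
        System.out.println("key=" + key + ", st=" + decodedSt + ", ed=" + decodedEd);
    }
}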
@@ -0,0 +1,566 @@ | |||
package qa.extract; | |||
import java.util.ArrayList; | |||
import java.util.Collections; | |||
import java.util.Comparator; | |||
import java.util.HashMap; | |||
import java.util.List; | |||
import java.io.IOException; | |||
import java.io.BufferedReader; | |||
import java.io.InputStreamReader; | |||
import lcn.EntityFragmentFields; | |||
import com.huaban.analysis.jieba.JiebaSegmenter; | |||
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode; | |||
import com.huaban.analysis.jieba.SegToken; | |||
import edu.stanford.nlp.util.Pair; | |||
import fgmt.TypeFragment; | |||
import qa.Query; | |||
import rdf.EntityMapping; | |||
import rdf.TypeMapping; | |||
import nlp.ds.*; | |||
import utils.FileUtil; | |||
final class MODNUM | |||
{ | |||
public static int prime=9999991; | |||
} | |||
//TODO: replace with nlp.ds.Word | |||
class Word | |||
{ | |||
//type: 0=normal word, 1=entity, 2=literal(string) | |||
String word; | |||
int type; | |||
int pos=0; | |||
List<String> entList=null; | |||
Word(String w) | |||
{ | |||
word=w; | |||
type=0; | |||
} | |||
Word(String w,int i) | |||
{ | |||
word=w; | |||
type=i; | |||
} | |||
Word(String w,int i, int j) | |||
{ | |||
word=w; | |||
type=i; | |||
pos=j; | |||
} | |||
Word(String w,int i, int j,List<String> l) | |||
{ | |||
word=w; | |||
type=i; | |||
pos=j; | |||
entList=l; | |||
} | |||
} | |||
class Ent | |||
{ | |||
public final int mod=MODNUM.prime; | |||
public String entity_name,mention; | |||
public int no; | |||
public long hashe,hashm; | |||
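// Parses one mention2ent line of the form "mention<TAB>entity_name<TAB>frequency"; indexOf(9) looks for the tab character.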
public Ent(String load) | |||
{ | |||
int indexOf9=load.indexOf(9); | |||
if (indexOf9>=0) | |||
{ | |||
mention=load.substring(0, indexOf9); | |||
String tmp=load.substring(indexOf9+1); | |||
int t9=tmp.indexOf(9); | |||
if (t9>=0) | |||
{ | |||
entity_name=tmp.substring(0, t9); | |||
String numberStr=tmp.substring(t9+1); | |||
try | |||
{ | |||
no=Integer.valueOf(numberStr); | |||
}catch(Exception e){no=-1;}; | |||
} | |||
else entity_name=tmp; | |||
hashe=calHash(entity_name); | |||
} | |||
else | |||
{ | |||
mention=load; | |||
hashe=-1; | |||
} | |||
hashm=calHash(mention); | |||
} | |||
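// Simple rolling hash: treat the string as base-65536 digits modulo the prime 9999991.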
public long calHash(String p) | |||
{ | |||
long x=0; | |||
if (p==null || p.length()==0) return 0; | |||
for (int i=0;i<p.length();i++) | |||
{ | |||
x=x*65536+(long)(int)p.charAt(i); | |||
x=x%mod; | |||
} | |||
return x; | |||
} | |||
@Override | |||
public int hashCode() | |||
{ | |||
return (int)hashm; | |||
} | |||
public Ent(){}; | |||
} | |||
public class EntityRecognitionCh { | |||
public static HashMap<String, List<String>> entMap,nentMap; | |||
public static JiebaSegmenter segmenter = new JiebaSegmenter(); | |||
public final static int MaxEnt=20; | |||
static | |||
{ | |||
long t0 = System.currentTimeMillis(); | |||
List<String> nent = FileUtil.readFile("data/pkubase/paraphrase/ccksminutf.txt"); | |||
List<String> mention2ent = FileUtil.readFile("data/pkubase/paraphrase/mini-mention2ent.txt"); | |||
entMap=new HashMap<>(); | |||
nentMap=new HashMap<>(); | |||
System.out.println("Mention2Ent size: " + mention2ent.size()); | |||
for (String input:mention2ent) | |||
{ | |||
Ent q=new Ent(input); | |||
if (entMap.containsKey(q.mention)) | |||
entMap.get(q.mention).add(q.entity_name); | |||
else | |||
{ | |||
List<String> l=new ArrayList<>(); | |||
l.add(q.entity_name); | |||
entMap.put(q.mention, l); | |||
} | |||
} | |||
// In the not-entity file, mention is a NOT-entity word and entity_name holds its frequency. | |||
for (String input:nent) | |||
{ | |||
Ent q=new Ent(input); | |||
if (nentMap.containsKey(q.mention)) | |||
nentMap.get(q.mention).add(q.entity_name); | |||
else | |||
{ | |||
List<String> l=new ArrayList<>(); | |||
l.add(q.entity_name); | |||
nentMap.put(q.mention, l); | |||
} | |||
} | |||
long t1 = System.currentTimeMillis(); | |||
System.out.println("Read Mention2Ent used "+(t1-t0)+"ms"); | |||
} | |||
public static boolean isAllNumber(String q) | |||
{ | |||
boolean ret=true; | |||
for (int i=0;i<q.length();i++) | |||
{ | |||
if (q.charAt(i)<48 || q.charAt(i)>57) return false; | |||
} | |||
return ret; | |||
} | |||
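/**
 * Greedy longest-match mention detection: every substring that hits the mention2ent
 * dictionary is recorded per length, then spans are fixed from the longest length
 * downwards as long as none of their characters is already covered. Accepted mentions
 * are wrapped in { }, while spans listed in the not-entity dictionary (nentMap) or
 * consisting only of digits are consumed but left unwrapped.
 * Hypothetical example: "狄仁杰是哪个朝代的人" -> "{狄仁杰}是哪个朝代的人".
 */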
public static String longestFirst2(String Question) | |||
{ | |||
String ret=""; | |||
String input=Question.replace('{',' ').replace('}',' '); | |||
int len=input.length(); | |||
int[][] ex=new int[len+3][]; | |||
Ent[][] entx=new Ent[len+3][]; | |||
for (int i=0;i<len+2;i++) ex[i]=new int[len+3]; | |||
for (int i=0;i<len+2;i++) entx[i]=new Ent[len+3]; | |||
for (int l=1;l<=len;l++) | |||
{ | |||
int pos=0; | |||
for (int j=l-1;j<len;j++) | |||
{ | |||
String searchstr=input.substring(j-l+1,j+1); | |||
List<String> rstlist=entMap.get(searchstr); | |||
if (rstlist!=null && rstlist.size()>0) | |||
{ | |||
++pos; | |||
ex[l][pos]=j; | |||
entx[l][pos]=new Ent(searchstr); | |||
} | |||
} | |||
ex[l][0]=pos; | |||
} | |||
int covered[]=new int[len+3]; | |||
for (int l=len;l>=1;l--) | |||
{ | |||
for (int p=1;p<=ex[l][0];p++) | |||
{ | |||
int flag=1; | |||
for (int k=ex[l][p];k>=ex[l][p]-l+1;k--) if (covered[k]>0) flag=0; | |||
if (flag==1) | |||
{ | |||
//bit flags: 1 = occupied, 2 = mention start, 4 = mention end, 8 = other (do not wrap) | |||
int FLAG=0; | |||
List<String> nlist=nentMap.get(entx[l][p].mention); | |||
if (nlist!=null && nlist.size()>0) FLAG=8; | |||
if (isAllNumber(entx[l][p].mention)) FLAG=8; | |||
covered[ex[l][p]]|=4; | |||
covered[ex[l][p]-l+1]|=2; | |||
for (int k=ex[l][p];k>=ex[l][p]-l+1;k--) | |||
{ | |||
covered[k]|=1|FLAG; | |||
} | |||
} | |||
} | |||
} | |||
for (int i=0;i<len;i++) | |||
{ | |||
if ((covered[i]&2)!=0 && (covered[i]&8)==0) ret=ret+"{"; | |||
ret=ret+Question.charAt(i); | |||
if ((covered[i]&4)!=0 && (covered[i]&8)==0) ret=ret+"}"; | |||
} | |||
//System.out.println("Longest First: "+ret); | |||
//System.out.println("Time: "+(t1-t0)+"ms"); | |||
return ret; | |||
} | |||
//1->① | |||
public static String intToCircle(int i) | |||
{ | |||
if (0>i || i>20) return null; | |||
String ret=""; | |||
ret=ret+(char)(9311+i); | |||
return ret; | |||
} | |||
//①->1 | |||
public static int circleToInt(String i) | |||
{ | |||
int ret=i.charAt(0)-9311; | |||
if (0<ret&& ret<20) return ret; | |||
else return -1; | |||
} | |||
public static Pair<String,List<Word>> processedString(String s) | |||
{ | |||
List<Word> ret=new ArrayList<>(); | |||
String sentence = ""; | |||
int flag=0; | |||
String word=""; | |||
for (int i=0;i<s.length();i++) | |||
{ | |||
if (s.charAt(i)=='{') | |||
{ | |||
flag=1; | |||
continue; | |||
} | |||
if (s.charAt(i)=='}') | |||
{ | |||
if (word.length()<=2) | |||
{ | |||
sentence+=word; | |||
word=""; | |||
flag=0; | |||
continue; | |||
} | |||
int FLAG=-1; | |||
for (Word j:ret) | |||
if (word.equals(j.word)) | |||
FLAG=j.pos; | |||
if (FLAG==-1) | |||
{ | |||
flag=0; | |||
ret.add(new Word(word,1,ret.size()+1)); | |||
word=""; | |||
sentence+=intToCircle(ret.size()); | |||
continue; | |||
} | |||
else | |||
{ | |||
flag=0; | |||
word=""; | |||
sentence+=intToCircle(FLAG); | |||
continue; | |||
} | |||
} | |||
if (flag==0) sentence+=s.charAt(i); | |||
if (flag==1) word=word+s.charAt(i); | |||
} | |||
return new Pair<String,List<Word>>(sentence,ret); | |||
} | |||
public static String reprocess(List<Word> d, List<SegToken> list) | |||
{ | |||
String ret=""; | |||
int used[]=new int[list.size()+1]; | |||
int isValid[]=new int[list.size()+1]; | |||
for (int i=0;i<list.size();i++) isValid[i]=0; | |||
for(int len=4;len>=1;len--) | |||
{ | |||
for (int i=0;i<list.size()-len+1;i++) | |||
{ | |||
String tmp=""; | |||
int flag=1; | |||
for (int j=i;j<i+len;j++) | |||
{ | |||
tmp=tmp+list.get(j).word; | |||
if (tmp.length()>4) flag=0; | |||
if (circleToInt(list.get(j).word)>=0) flag=0; | |||
if (used[j]==1) flag=0; | |||
} | |||
if (flag==0) continue; | |||
List<String> rstlist=entMap.get(tmp); | |||
List<String> nlist=nentMap.get(tmp); | |||
if (nlist!=null && nlist.size()>0) | |||
{ | |||
for (int j=i;j<i+len;j++) | |||
{ | |||
used[j]=1; | |||
} | |||
} | |||
if (rstlist!=null && rstlist.size()>0 && (nlist==null||nlist.size()==0)) | |||
{ | |||
for (int j=i;j<i+len;j++) used[j]=1; | |||
int pos=-1; | |||
for (Word k:d) if (tmp.equals(k.word)) | |||
{ | |||
pos=k.pos;break; | |||
} | |||
if (pos>0) | |||
{ | |||
isValid[i]=pos; | |||
for (int j=i+1;j<i+len;j++)isValid[j]=-1; | |||
} | |||
else | |||
{ | |||
d.add(new Word(tmp,1,d.size()+1)); | |||
isValid[i]=d.size(); | |||
for (int j=i+1;j<i+len;j++)isValid[j]=-1; | |||
} | |||
} | |||
} | |||
} | |||
for (int i=0;i<list.size();i++) | |||
{ | |||
if (isValid[i]==0) | |||
{ | |||
ret=ret+list.get(i).word; | |||
} | |||
if (isValid[i]>0) | |||
{ | |||
ret=ret+intToCircle(isValid[i]); | |||
} | |||
} | |||
return ret; | |||
} | |||
public static String removeQueryId2(String question) | |||
{ | |||
String ret = question; | |||
int st = question.indexOf(":"); | |||
if(st!=-1 && st<6 && question.length()>4 && ((question.charAt(0)>='0' && question.charAt(0)<='9') ||question.charAt(0)=='q')) | |||
{ | |||
ret = question.substring(st+1); | |||
} | |||
return ret; | |||
} | |||
public static String thirdprocess(String sentence,List<Word> d) | |||
{ | |||
String temp="",rets2=""; | |||
int insyh=0; | |||
int count=0; | |||
List<Integer> lst=new ArrayList<>(); | |||
String syh=""; | |||
for (int i=0;i<sentence.length();i++) | |||
{ | |||
if (circleToInt(""+sentence.charAt(i))!=-1) | |||
{ | |||
count++; | |||
} | |||
else | |||
{ | |||
if (count>=3) | |||
{ | |||
String newent=""; | |||
for (int j=i-count;j<i;j++) | |||
{ | |||
newent+=d.get(circleToInt(""+sentence.charAt(j))-1).word; | |||
} | |||
temp+=intToCircle(d.size()); | |||
d.add(new Word(newent,2,d.size()+1)); | |||
} | |||
else | |||
for (int j=i-count;j<i;j++) | |||
{ | |||
temp+=sentence.charAt(j); | |||
} | |||
temp+=sentence.charAt(i); | |||
count=0; | |||
} | |||
} | |||
for (int i=0;i<temp.length();i++) | |||
{ | |||
if (temp.charAt(i)=='"'&&insyh==0 || temp.charAt(i)=='“') | |||
{ | |||
insyh=1; | |||
syh=""; | |||
rets2+=temp.charAt(i); | |||
} | |||
else if (temp.charAt(i)=='"'&&insyh==1 || temp.charAt(i)=='”') | |||
{ | |||
insyh=0; | |||
if (lst.size()>=1) | |||
{ | |||
String rp=""; | |||
for (int j=0;j<syh.length();j++) | |||
{ | |||
int q=circleToInt(""+syh.charAt(j)); | |||
if (q==-1) | |||
rp+=syh.charAt(j); | |||
else | |||
{ | |||
rp+=d.get(q-1).word; | |||
//ret[q]=""; | |||
} | |||
} | |||
d.add(new Word(rp,2,d.size()+1)); | |||
rets2+=intToCircle(d.size())+temp.charAt(i); | |||
} | |||
else | |||
{ | |||
rets2+=syh+temp.charAt(i); | |||
} | |||
} | |||
else if (insyh==1) | |||
{ | |||
if (circleToInt(""+temp.charAt(i))!=-1) | |||
lst.add(circleToInt(""+temp.charAt(i))); | |||
syh+=temp.charAt(i); | |||
} | |||
else | |||
rets2+=temp.charAt(i); | |||
} | |||
return rets2; | |||
} | |||
public static Pair<String,List<Word>> parse(String input, JiebaSegmenter segmenter) | |||
{ | |||
// input=removeQueryId2(input); // Remove query id before. | |||
String newinput=longestFirst2 (input); | |||
Pair<String,List<Word>> d=null,r=new Pair<String,List<Word>>(); | |||
r.second=new ArrayList<>(); | |||
try { | |||
d=processedString(newinput); | |||
} catch (Exception e) { | |||
System.out.println(e); | |||
} | |||
if (d!=null) | |||
{ | |||
//System.out.println(d.first); | |||
List<SegToken> q=segmenter.process(d.first, SegMode.SEARCH); | |||
String secondstr=""; | |||
for (SegToken t:q) | |||
{ | |||
secondstr=secondstr+t.word+","; | |||
} | |||
//System.out.println("First process: "+secondstr); | |||
String finalstring=""; | |||
String stickstr=reprocess(d.second,q); | |||
String thirdstr=thirdprocess(stickstr,d.second); | |||
List<SegToken> q2=segmenter.process(thirdstr, SegMode.SEARCH); | |||
for (SegToken t:q2) | |||
{ | |||
finalstring=finalstring+t.word+","; | |||
int p=circleToInt(""+t.word.charAt(0)); | |||
if (p!=-1) | |||
{ | |||
Word ds=d.second.get(p-1); | |||
r.second.add(new Word(ds.word,ds.type,ds.pos,entMap.get(ds.word))); | |||
} | |||
else | |||
{ | |||
r.second.add(new Word(t.word,0,-1)); | |||
} | |||
} | |||
System.out.println("Result: "+finalstring); | |||
r.first=thirdstr; | |||
return r; | |||
} | |||
else return null; | |||
} | |||
public static List<nlp.ds.Word> parseSentAndRecogEnt(String sent) | |||
{ | |||
Pair<String, List<Word>> result = parse(sent, segmenter); | |||
if(result == null) | |||
return null; | |||
List<nlp.ds.Word> words = new ArrayList<nlp.ds.Word>(); | |||
int position = 1; | |||
for(Word ow: result.second) | |||
{ | |||
// Note: jieba POS tagging is deprecated, so we use the Stanford parser to obtain POS tags later. | |||
nlp.ds.Word word = new nlp.ds.Word(ow.word, ow.word, null, position++); | |||
words.add(word); | |||
if(ow.type == 1 && ow.entList != null) | |||
{ | |||
// For now, just handle TYPE here in a simple way. | |||
if(TypeFragment.typeShortName2IdList.containsKey(ow.word)) | |||
{ | |||
word.mayType = true; | |||
word.tmList.add(new TypeMapping(TypeFragment.typeShortName2IdList.get(ow.word).get(0), ow.word, 100.0)); | |||
} | |||
word.mayEnt = true; | |||
word.emList = new ArrayList<EntityMapping>(); | |||
double score = 100; | |||
for(String ent: ow.entList) | |||
{ | |||
if(EntityFragmentFields.entityName2Id.containsKey(ent)) | |||
{ | |||
//TODO: consider more suitable entity score | |||
int eid = EntityFragmentFields.entityName2Id.get(ent); | |||
// String fstr = EntityFragmentFields.entityFragmentString.get(eid); | |||
// System.out.println(eid+"\t"+fstr); | |||
word.emList.add(new EntityMapping(eid, ent, score)); | |||
score -= 10; | |||
} | |||
} | |||
} | |||
else if(ow.type == 2) | |||
word.mayLiteral = true; | |||
// TODO: consider TYPE | |||
} | |||
return words; | |||
} | |||
public static void main(String[] args) throws IOException { | |||
EntityFragmentFields.load(); | |||
List<String> inputList = FileUtil.readFile("data/test/mini-ccks.txt"); | |||
for(String input: inputList) | |||
{ | |||
if (input.length()<2 || input.charAt(0)!='q') continue; | |||
System.out.println("----------------------------------------"); | |||
System.out.println(input); | |||
EntityRecognitionCh.parseSentAndRecogEnt(input); | |||
} | |||
} | |||
} | |||
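The class above protects recognized mentions from being re-split by jieba by temporarily replacing each one with a circled digit (①–⑳, code points 9312–9331) and mapping it back afterwards. A minimal standalone sketch of that placeholder round trip, mirroring intToCircle/circleToInt:

public class CirclePlaceholderSketch {
    // 1 -> "①" (U+2460), ..., 20 -> "⑳"; out-of-range values return null, as in intToCircle.
    static String intToCircle(int i) {
        return (i > 0 && i <= 20) ? String.valueOf((char) (9311 + i)) : null;
    }
    // "①" -> 1; anything that is not a circled digit in 1..19 decodes to -1, as in circleToInt.
    static int circleToInt(String s) {
        int v = s.charAt(0) - 9311;
        return (v > 0 && v < 20) ? v : -1;
    }
    public static void main(String[] args) {
        String placeholder = intToCircle(3);
        System.out.println(placeholder + " -> " + circleToInt(placeholder)); // prints "③ -> 3"
    }
}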
@@ -19,7 +19,6 @@ import log.QueryLogger; | |||
import fgmt.EntityFragment; | |||
import fgmt.TypeFragment; | |||
import nlp.ds.Word; | |||
import nlp.tool.CoreNLP; | |||
public class ExtractImplicitRelation { | |||
@@ -374,7 +373,7 @@ public class ExtractImplicitRelation { | |||
public static void main(String[] args) throws Exception { | |||
Globals.coreNLP = new CoreNLP(); | |||
// Globals.coreNLP = new CoreNLP(); | |||
Globals.pd = new ParaphraseDictionary(); | |||
try | |||
{ | |||
@@ -28,8 +28,6 @@ public class ExtractRelation { | |||
public ArrayList<SimpleRelation> findRelationsBetweenTwoUnit(SemanticUnit su1, SemanticUnit su2, QueryLogger qlog) | |||
{ | |||
DependencyTree T = qlog.s.dependencyTreeStanford; | |||
if(qlog.isMaltParserUsed) | |||
T = qlog.s.dependencyTreeMalt; | |||
DependencyTreeNode n1 = T.getNodeByIndex(su1.centerWord.position), n2 = T.getNodeByIndex(su2.centerWord.position); | |||
ArrayList<DependencyTreeNode> shortestPath = T.getShortestNodePathBetween(n1,n2); | |||
@@ -90,15 +90,7 @@ public class TypeRecognition { | |||
if(allUpperFormWord.length() > 1 && allUpperFormWord.substring(1).equals(allUpperFormWord.substring(1).toLowerCase())) | |||
return null; | |||
//search in YAGO type | |||
if(TypeFragment.yagoTypeList.contains(allUpperFormWord)) | |||
{ | |||
//YAGO prefix | |||
String typeName = "yago:"+allUpperFormWord; | |||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||
tmList.add(tm); | |||
} | |||
else if(extendTypeMap.containsKey(allUpperFormWord)) | |||
if(extendTypeMap.containsKey(allUpperFormWord)) | |||
{ | |||
String typeName = extendTypeMap.get(allUpperFormWord); | |||
TypeMapping tm = new TypeMapping(-1,typeName,Globals.pd.typePredicateID,1); | |||
@@ -251,22 +243,22 @@ public class TypeRecognition { | |||
} | |||
} | |||
// type | |||
else if(sr.arg1Word.mayType) | |||
else if(sr.arg1Word.mayType) //TODO: type | |||
{ | |||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||
if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) | |||
&& !words[arg1WordPos-2].posTag.startsWith("V")) | |||
{ | |||
sr.isArg1Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
//constant type should be object | |||
sr.preferredSubj = sr.arg2Word; | |||
} | |||
// if(arg1WordPos >= 2 && (words[arg1WordPos-1].baseForm.equals("in") || words[arg1WordPos-1].baseForm.equals("of")) | |||
// && !words[arg1WordPos-2].posTag.startsWith("V")) | |||
// { | |||
// sr.isArg1Constant = true; | |||
// double largerScore = 1000; | |||
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
// largerScore = sr.predicateMappings.get(0).score * 2; | |||
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
// sr.predicateMappings.add(0,nPredicate); | |||
// | |||
// //constant type should be object | |||
// sr.preferredSubj = sr.arg2Word; | |||
// } | |||
} | |||
//ent: constant | |||
else if(sr.arg1Word.mayEnt) | |||
@@ -297,37 +289,37 @@ public class TypeRecognition { | |||
else if(sr.arg2Word.mayType) | |||
{ | |||
//rule in/of [type] -> constant |eg, How many [countries] are there in [exT:Europe] -> ?uri rdf:type yago:EuropeanCountries | |||
if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) | |||
&& !words[arg2WordPos-2].posTag.startsWith("V") ) | |||
{ | |||
sr.isArg2Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
sr.preferredSubj = sr.arg1Word; | |||
} | |||
// if(arg2WordPos >= 2 && (words[arg2WordPos-1].baseForm.equals("in") || words[arg2WordPos-1].baseForm.equals("of")) | |||
// && !words[arg2WordPos-2].posTag.startsWith("V") ) | |||
// { | |||
// sr.isArg2Constant = true; | |||
// double largerScore = 1000; | |||
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
// largerScore = sr.predicateMappings.get(0).score * 2; | |||
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
// sr.predicateMappings.add(0,nPredicate); | |||
// | |||
// sr.preferredSubj = sr.arg1Word; | |||
// } | |||
//rule: Be ... a type? | |||
if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) | |||
{ | |||
sr.isArg2Constant = true; | |||
double largerScore = 1000; | |||
if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
largerScore = sr.predicateMappings.get(0).score * 2; | |||
PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
sr.predicateMappings.add(0,nPredicate); | |||
sr.preferredSubj = sr.arg1Word; | |||
} | |||
// if(words[0].baseForm.equals("be") && arg2WordPos >=3 && words[arg2WordPos-1].baseForm.equals("a")) | |||
// { | |||
// sr.isArg2Constant = true; | |||
// double largerScore = 1000; | |||
// if(sr.predicateMappings!=null && sr.predicateMappings.size()>0) | |||
// largerScore = sr.predicateMappings.get(0).score * 2; | |||
// PredicateMapping nPredicate = new PredicateMapping(Globals.pd.typePredicateID, largerScore, "[type]"); | |||
// sr.predicateMappings.add(0,nPredicate); | |||
// | |||
// sr.preferredSubj = sr.arg1Word; | |||
// } | |||
} | |||
else if(sr.arg2Word.mayEnt) | |||
{ | |||
sr.isArg2Constant = true; | |||
} | |||
if(sr.arg1Word != sr.preferredSubj) | |||
if(sr.arg2Word == sr.preferredSubj) | |||
sr.swapArg1Arg2(); | |||
} | |||
} | |||
@@ -1,163 +0,0 @@ | |||
package qa.mapping; | |||
import java.io.BufferedReader; | |||
import java.io.IOException; | |||
import java.io.InputStreamReader; | |||
import java.util.ArrayList; | |||
import java.util.HashMap; | |||
import lcn.EntityFragmentFields; | |||
import log.QueryLogger; | |||
import org.apache.commons.httpclient.HttpClient; | |||
import org.apache.commons.httpclient.HttpException; | |||
import org.apache.commons.httpclient.methods.GetMethod; | |||
import fgmt.EntityFragment; | |||
import rdf.EntityMapping; | |||
public class DBpediaLookup { | |||
//Two alternative endpoints for entity lookup (Wikipedia opensearch and DBpedia Lookup). | |||
//public static final String baseURL = "http://en.wikipedia.org/w/api.php?action=opensearch&format=xml&limit=10&search="; | |||
public static final String baseURL = "http://lookup.dbpedia.org/api/search.asmx/KeywordSearch?MaxHits=5&QueryString="; | |||
public HttpClient ctripHttpClient = null; | |||
//public static final String begin = "<Text xml:space=\"preserve\">"; | |||
//public static final String begin = "<Result>\n <Label>"; | |||
public static final String begin = "<Result>\n <Label>"; | |||
public static final int begin_length = begin.length(); | |||
//public static final String end = "</Text>"; | |||
public static final String end = "</Label>"; | |||
public static final int end_length = end.length(); | |||
public static HashMap<String, String> entMentionDict = null; // TODO: build the mention2ent dictionary from redirect data & Wikipedia click data; for now it is filled manually | |||
public DBpediaLookup() | |||
{ | |||
ctripHttpClient = new HttpClient(); | |||
ctripHttpClient.setTimeout(3000); | |||
entMentionDict = new HashMap<String, String>(); | |||
entMentionDict.put("Prince_Charles", "Charles,_Prince_of_Wales"); | |||
} | |||
public ArrayList<EntityMapping> getEntityMappings(String searchString, QueryLogger qlog) | |||
{ | |||
ArrayList<String> slist = new ArrayList<String>(); | |||
if(entMentionDict.containsKey(searchString)) | |||
slist.add(entMentionDict.get(searchString)); | |||
else | |||
slist = lookForEntityNames(searchString, qlog); | |||
if (slist.size() == 0 && searchString.contains(". ")) | |||
slist.addAll(lookForEntityNames(searchString.replaceAll(". ", "."), qlog)); | |||
ArrayList<EntityMapping> emlist = new ArrayList<EntityMapping>(); | |||
// The search string uses "_" as its delimiter (original form) | |||
String[] sa = searchString.split("_"); | |||
int UpperCnt = 0; | |||
for(String str: sa) | |||
{ | |||
if( (str.charAt(0)>='A'&&str.charAt(0)<='Z') || (str.charAt(0)>='0'&&str.charAt(0)<='9') ) | |||
UpperCnt ++; | |||
} | |||
System.out.print("DBpediaLookup find: " + slist + ", "); | |||
int count = 40; | |||
for (String s : slist) | |||
{ | |||
//treat the mention as an ABBR only when every token is capitalized or numeric; otherwise drop candidates whose edit distance is too large | |||
if(UpperCnt < sa.length && EntityFragment.calEditDistance(s, searchString.replace("_", ""))>searchString.length()/2) | |||
continue; | |||
int eid = -1; | |||
s = s.replace(" ", "_"); | |||
if(EntityFragmentFields.entityName2Id.containsKey(s)) | |||
{ | |||
eid = EntityFragmentFields.entityName2Id.get(s); | |||
emlist.add(new EntityMapping(eid, s, count)); | |||
count -=2 ; | |||
} | |||
else | |||
{ | |||
System.out.print("Drop "+s+" because it not in Entity Dictionary. "); | |||
} | |||
} | |||
System.out.println("DBpediaLookup select: " + emlist); | |||
return emlist; | |||
} | |||
public ArrayList<String> lookForEntityNames (String searchString, QueryLogger qlog) { | |||
// URL encoding: " " -> %20 | |||
GetMethod getMethod = new GetMethod((baseURL+searchString).replaceAll(" ", "%20")); | |||
ArrayList<String> ret = new ArrayList<String>(); | |||
int statusCode; | |||
try { | |||
statusCode = ctripHttpClient.executeMethod(getMethod); | |||
} catch (HttpException e) { | |||
e.printStackTrace(); | |||
return ret; | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
return ret; | |||
} | |||
if (statusCode!=200) return null; | |||
String response = getMethod.getResponseBodyAsString(); | |||
if (qlog != null && qlog.MODE_debug) { | |||
System.out.println("searchString=" + searchString); | |||
System.out.println("statusCode=" + statusCode); | |||
System.out.println("response=" + getMethod.getResponseBodyAsString()); | |||
} | |||
getMethod.releaseConnection(); | |||
//System.out.println(response); | |||
if (response == null || response.isEmpty()) | |||
return ret; | |||
int idx1 = response.indexOf(begin); | |||
while (idx1 != -1) { | |||
int idx2 = response.indexOf(end, idx1+begin_length); | |||
String ss = response.substring(idx1+begin_length, idx2); | |||
ret.add(ss); | |||
//System.out.println(ss); | |||
idx1 = response.indexOf(begin, idx2 + end_length); | |||
} | |||
return ret; | |||
} | |||
public static void main(String argv[]){ | |||
DBpediaLookup dbplook = new DBpediaLookup(); | |||
BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); | |||
try { | |||
while (true) { | |||
System.out.println("Test DBpediaLookup."); | |||
System.out.print("Please input the search string: "); | |||
String searchString = br.readLine(); | |||
try { | |||
long t1 = System.currentTimeMillis(); | |||
ArrayList<String> res = dbplook.lookForEntityNames(searchString, null); | |||
long t2 = System.currentTimeMillis(); | |||
System.out.println(res); | |||
System.out.println("time=" + (t2-t1) + "ms"); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
} | |||
} catch (IOException e) { | |||
e.printStackTrace(); | |||
} | |||
return; | |||
} | |||
} |
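For reference, a minimal self-contained sketch of the substring-based label extraction in lookForEntityNames (the DBpediaLookup class removed above); the response string and class name here are made up for illustration and are not the service's actual output:

import java.util.ArrayList;

public class LookupParseSketch {
    public static void main(String[] args) {
        String begin = "<Result>\n  <Label>", end = "</Label>";
        // Illustrative response fragment only.
        String response = begin + "Charles,_Prince_of_Wales" + end
                        + "<URI>...</URI>"
                        + begin + "Prince_Charles_Cinema" + end;
        ArrayList<String> labels = new ArrayList<String>();
        int idx1 = response.indexOf(begin);
        while (idx1 != -1) {
            int idx2 = response.indexOf(end, idx1 + begin.length());
            labels.add(response.substring(idx1 + begin.length(), idx2));
            idx1 = response.indexOf(begin, idx2 + end.length());
        }
        System.out.println(labels); // [Charles,_Prince_of_Wales, Prince_Charles_Cinema]
    }
}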
@@ -37,84 +37,19 @@ public class BuildQueryGraph | |||
public BuildQueryGraph() | |||
{ | |||
whList.add("what"); | |||
whList.add("which"); | |||
whList.add("who"); | |||
whList.add("whom"); | |||
whList.add("when"); | |||
whList.add("how"); | |||
whList.add("where"); | |||
whList.add("什么"); | |||
whList.add("什么时候"); | |||
whList.add("哪些"); | |||
whList.add("哪里"); | |||
whList.add("谁"); | |||
// Bad words for NODE. (base form) | |||
// We will train a node recognition model later to replace these heuristic rules. | |||
stopNodeList.add("list"); | |||
stopNodeList.add("give"); | |||
stopNodeList.add("show"); | |||
stopNodeList.add("star"); | |||
stopNodeList.add("theme"); | |||
stopNodeList.add("world"); | |||
stopNodeList.add("independence"); | |||
stopNodeList.add("office"); | |||
stopNodeList.add("year"); | |||
stopNodeList.add("work"); | |||
} | |||
public void fixStopWord(QueryLogger qlog, DependencyTree ds) | |||
{ | |||
String qStr = qlog.s.plainText.toLowerCase(); | |||
//... [which] | |||
for(int i=2;i<qlog.s.words.length;i++) | |||
if(qlog.s.words[i].baseForm.equals("which")) | |||
stopNodeList.add(qlog.s.words[i].baseForm); | |||
//take [place] | |||
if(qStr.contains("take place") || qStr.contains("took place")) | |||
stopNodeList.add("place"); | |||
//(When was Alberta admitted) as [province] | |||
if(qStr.contains("as province")) | |||
stopNodeList.add("province"); | |||
//what form of government is found in ... | |||
if(qStr.contains("form of government")) | |||
stopNodeList.add("government"); | |||
//alma mater of the chancellor | |||
if(qStr.contains("alma mater of the chancellor")) | |||
{ | |||
stopNodeList.add("chancellor"); | |||
} | |||
//How large is the area of UK? | |||
if(qStr.contains("the area of") || qStr.contains("how big")) | |||
{ | |||
stopNodeList.add("area"); | |||
} | |||
//how much is the total population of european union? | |||
if(qStr.contains("how much")) | |||
{ | |||
stopNodeList.add("population"); | |||
stopNodeList.add("elevation"); | |||
} | |||
//when was the founding date of french fifth republic | |||
if(qStr.contains("when was the")) | |||
{ | |||
stopNodeList.add("founding"); | |||
stopNodeList.add("date"); | |||
stopNodeList.add("death"); | |||
stopNodeList.add("episode"); | |||
} | |||
if(qStr.contains("what other book")) | |||
{ | |||
stopNodeList.add("book"); | |||
} | |||
//Is [Michelle Obama] the [wife] of Barack Obama? | |||
if(qlog.s.words[0].baseForm.equals("be") && isNode(ds.getNodeByIndex(2)) && ds.getNodeByIndex(3).dep_father2child.equals("det") | |||
&& isNode(ds.getNodeByIndex(4)) && qlog.s.words[4].baseForm.equals("of")) | |||
stopNodeList.add(ds.getNodeByIndex(4).word.baseForm); | |||
stopNodeList.add("信仰"); | |||
stopNodeList.add("人"); | |||
} | |||
// Semantic Parsing for DBpedia. | |||
// Semantic Parsing for Pkubase. | |||
public ArrayList<SemanticUnit> process(QueryLogger qlog) | |||
{ | |||
try | |||
@@ -135,15 +70,15 @@ public class BuildQueryGraph | |||
* 3)Coreference resolution. | |||
* */ | |||
//0) Fix stop words | |||
fixStopWord(qlog, ds); | |||
// fixStopWord(qlog, ds); | |||
//1) Detect Modifier/Modified | |||
//rely on sentence (rather than dependency tree) | |||
//with some ADJUSTMENT (eg, ent+noun(noType&&noEnt) -> noun.omitNode=TRUE) | |||
for(Word word: qlog.s.words) | |||
getTheModifiedWordBySentence(qlog.s, word); //Find continuous modifier | |||
for(Word word: qlog.s.words) | |||
getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier | |||
// for(Word word: qlog.s.words) | |||
// getDiscreteModifiedWordBySentence(qlog.s, word); //Find discrete modifier | |||
for(Word word: qlog.s.words) | |||
if(word.modifiedWord == null) //Other words modify themselves. NOTICE: only can be called after detecting all modifier. | |||
word.modifiedWord = word; | |||
@@ -167,9 +102,9 @@ public class BuildQueryGraph | |||
qlog.target = target.word; | |||
// !target can NOT be an entity (except in general questions) | which [city] has most people? | |||
if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.emList!=null) | |||
// only when target.mayType=True or other entities exist. | |||
if(qlog.s.sentenceType != SentenceType.GeneralQuestion && target.word.mayEnt && target.word.mayType) | |||
{ | |||
//Counter example: Give me all Seven_Wonders_of_the_Ancient_World | (in fact, it is not an ENT but a CATEGORY: ?x subject Seve...) | |||
target.word.mayEnt = false; | |||
target.word.emList.clear(); | |||
} | |||
@@ -241,6 +176,17 @@ public class BuildQueryGraph | |||
curSU.neighborUnitList.add(expandSU); | |||
} | |||
} | |||
if(semanticUnitList.size() == 1 && target.word.mayEnt) | |||
{ | |||
Word[] words = qlog.s.words; | |||
SemanticUnit curSU = semanticUnitList.get(0); | |||
SemanticUnit expandSU = new SemanticUnit(words[words.length-1], false); | |||
semanticUnitList.add(expandSU); | |||
curSU.neighborUnitList.add(expandSU); | |||
expandSU.neighborUnitList.add(curSU); | |||
target = ds.getNodeByIndex(words.length); | |||
qlog.target = target.word; | |||
} | |||
qlog.timeTable.put("BQG_structure", (int)(System.currentTimeMillis()-t)); | |||
//step2: Find relations (Notice, we regard that the coreference have been resolved now) | |||
@@ -251,7 +197,7 @@ public class BuildQueryGraph | |||
qlog.timeTable.put("BQG_relation", (int)(System.currentTimeMillis()-t)); | |||
//Prepare for item mapping | |||
TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary | |||
// TypeRecognition.AddTypesOfWhwords(qlog.semanticRelations); // Type supplementary | |||
TypeRecognition.constantVariableRecognition(qlog.semanticRelations, qlog); // Constant or Variable, embedded triples | |||
//(just for display) | |||
@@ -361,7 +307,7 @@ public class BuildQueryGraph | |||
tmpRelations = new ArrayList<SimpleRelation>(); | |||
//Copy relations (for 'and', 'as soon as'...) |eg, In which films did Julia_Roberts and Richard_Gere play? | |||
//TODO: judge by dependency tree | other way to supplement relations | |||
if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("and")) | |||
if(curSU.centerWord.position + 2 == expandSU.centerWord.position && qlog.s.words[curSU.centerWord.position].baseForm.equals("和")) | |||
{ | |||
for(SimpleRelation sr: simpleRelations) | |||
{ | |||
@@ -566,6 +512,7 @@ public class BuildQueryGraph | |||
return false; | |||
} | |||
// detect the target (question focus) and resolve some co-references via rules. (TODO: test existing utils for co-reference resolution) | |||
public DependencyTreeNode detectTarget(DependencyTree ds, QueryLogger qlog) | |||
{ | |||
visited.clear(); | |||
@@ -583,8 +530,10 @@ public class BuildQueryGraph | |||
// No wh-word: use the first node; NOTICE: consider MODIFIER rules. E.g., was us president Obama ..., target=obama (rather than us) | |||
if(target == null) | |||
{ | |||
for(Word word: words) | |||
//Chinese sentences: the question focus usually appears near the end. | |||
for(int i=words.length-1; i>=0; i--) | |||
{ | |||
Word word = words[i]; | |||
Word modifiedWord = word.modifiedWord; | |||
if(modifiedWord != null && isNodeCandidate(modifiedWord)) | |||
{ | |||
@@ -594,42 +543,25 @@ public class BuildQueryGraph | |||
} | |||
if(target == null) | |||
target = ds.nodesList.get(0); | |||
/* Are [E|tree_frogs] a type of [E|amphibian] , type | |||
*/ | |||
for(DependencyTreeNode dtn: target.childrenList) | |||
{ | |||
if(dtn.word.baseForm.equals("type")) | |||
{ | |||
dtn.word.represent = target.word; | |||
} | |||
} | |||
target = ds.nodesList.get(0); | |||
} | |||
//where, NOTICE: wh target from NN may not pass the function isNode() | |||
if(target.word.baseForm.equals("where")) | |||
//where | |||
if(target.word.baseForm.equals("哪里")) | |||
{ | |||
int curPos = target.word.position - 1; | |||
//!Where is the residence of | |||
if(words[curPos+1].baseForm.equals("be") && words[curPos+2].posTag.equals("DT")) | |||
//大兴安岭的[终点]是(哪里) | |||
if(curPos-2>=0 && isNodeCandidate(words[curPos-2]) && words[curPos-1].baseForm.equals("是")) | |||
{ | |||
for(int i=curPos+4;i<words.length;i++) | |||
if(words[i-1].posTag.startsWith("N") && words[i].posTag.equals("IN")) | |||
{ | |||
target.word.represent = words[i-1]; | |||
target = ds.getNodeByIndex(i); | |||
break; | |||
} | |||
target.word.represent = words[curPos-1]; | |||
target = ds.getNodeByIndex(words[curPos-1].position); | |||
} | |||
} | |||
//which | |||
if(target.word.baseForm.equals("which")) | |||
if(target.word.baseForm.equals("哪些") || target.word.baseForm.equals("哪个")) | |||
{ | |||
// test case: In which US state is Mount_McKinley located | |||
// test case: 韩国有哪些著名景点? | |||
int curPos = target.word.position-1; | |||
if(curPos+1 < words.length) | |||
{ | |||
@@ -639,27 +571,10 @@ public class BuildQueryGraph | |||
// which city ... target = city | |||
target.word.represent = word1; | |||
target = ds.getNodeByIndex(word1.position); | |||
int word1Pos = word1.position - 1; | |||
// word1 + be + (the) + word2, and be is root: word1 & word2 may coreference | |||
if(ds.root.word.baseForm.equals("be") && word1Pos+3 < words.length && words[word1Pos+1].baseForm.equals("be")) | |||
{ | |||
// which city is [the] headquarters ... | |||
Word word2 = words[word1Pos+2].modifiedWord; | |||
if(words[word1Pos+2].posTag.equals("DT")) | |||
word2 = words[word1Pos+3].modifiedWord; | |||
int word2Pos = word2.position - 1; | |||
if(word2Pos+1 < words.length && isNodeCandidate(word2) && words[word2Pos+1].posTag.startsWith("IN")) | |||
{ | |||
//In which city is [the] headquarters of ... | target = headquarters, city & headquarters: coreference | |||
//In which city was the president of Montenegro born? | COUNTER example, city & president: independent | |||
target.word.represent = word2; | |||
target = ds.getNodeByIndex(word2.position); | |||
} | |||
} | |||
} | |||
} | |||
// by dependency tree | |||
if(target.word.baseForm.equals("which")) | |||
if(target.word.baseForm.equals("哪些") || target.word.baseForm.equals("哪个")) | |||
{ | |||
//Which of <films> had the highest budget | |||
boolean ok = false; | |||
@@ -683,14 +598,14 @@ public class BuildQueryGraph | |||
} | |||
//what | |||
else if(target.word.baseForm.equals("what")) | |||
else if(target.word.baseForm.equals("什么")) | |||
{ | |||
//Detect:what is [the] sth1 prep. sth2? | |||
//Detect:龙卷风的[英文名]是(什么) | 金轮国师的(什么)[武功]有十龙十象之力? | |||
//Omit: what is sth? | |||
if(target.father != null && ds.nodesList.size()>=5) | |||
{ | |||
DependencyTreeNode tmp1 = target.father; | |||
if(tmp1.word.baseForm.equals("be")) | |||
if(tmp1.word.baseForm.equals("是")) | |||
{ | |||
for(DependencyTreeNode child: tmp1.childrenList) | |||
{ | |||
@@ -698,15 +613,13 @@ public class BuildQueryGraph | |||
continue; | |||
if(isNode(child)) | |||
{ | |||
//sth1 | |||
boolean hasPrep = false; | |||
boolean another_node = false; | |||
for(DependencyTreeNode grandson: child.childrenList) | |||
{ //prep | |||
if(grandson.dep_father2child.equals("prep")) | |||
hasPrep = true; | |||
} | |||
//Detect modifier: what is the sth1's [sth2]? | what is the largest [city]? | |||
if(hasPrep || qlog.s.hasModifier(child.word)) | |||
if(isNode(grandson)) | |||
another_node = true; | |||
//more than 2 nodes || Detect modifier: what is the sth1's [sth2]? | what is the largest [city]? | |||
if(another_node || qlog.s.hasModifier(child.word)) | |||
{ | |||
target.word.represent = child.word; | |||
target = child; | |||
@@ -715,82 +628,84 @@ public class BuildQueryGraph | |||
} | |||
} | |||
} | |||
//what sth || What airlines are (part) of the SkyTeam alliance? | |||
//what sth: 什么山高于8000米 | |||
else if(isNode(tmp1)) | |||
{ | |||
target.word.represent = tmp1.word; | |||
target = tmp1; | |||
// Coreference resolution | |||
int curPos = target.word.position - 1; | |||
if(curPos+3<words.length && words[curPos+1].baseForm.equals("be")&&words[curPos+3].posTag.startsWith("IN") && words.length > 6) | |||
{ | |||
words[curPos+2].represent = target.word; | |||
} | |||
target = tmp1; | |||
} | |||
} | |||
// by sentence | |||
if(target.word.baseForm.equals("what")) | |||
if(target.word.baseForm.equals("什么")) | |||
{ | |||
// 金轮国师的(什么)[武功]有十龙十象之力? | |||
int curPos = target.word.position - 1; | |||
// what be the [node] ... ? (Notice: words.length CONTAINS symbol(?),different from nodeList) | |||
if(words.length > 5 && words[curPos+1].baseForm.equals("be") && words[curPos+2].baseForm.equals("the") && isNodeCandidate(words[curPos+3])) | |||
if(curPos + 1 <= words.length - 1 && isNodeCandidate(words[curPos+1])) | |||
{ | |||
target.word.represent = words[curPos+3]; | |||
target = ds.getNodeByIndex(words[curPos+3].position); | |||
target.word.represent = words[curPos+1]; | |||
target = ds.getNodeByIndex(words[curPos+1].position); | |||
} | |||
} | |||
} | |||
//who | |||
else if(target.word.baseForm.equals("who")) | |||
else if(target.word.baseForm.equals("谁")) | |||
{ | |||
//Detect:who is/does [the] sth1 prep. sth2? || Who was the pope that founded the Vatican_Television ? | Who does the voice of Bart Simpson? | |||
//Detect:武汉大学的现任[校长]是(谁)? 和子女一起演过电影电视剧的[演员]有(谁)? | |||
//Others: who is sth? who do sth? | target = who | |||
//test case: Who is the daughter of Robert_Kennedy married to? | |||
if(ds.nodesList.size()>=5) | |||
{ //who | |||
for(DependencyTreeNode tmp1: ds.nodesList) | |||
{ | |||
if(tmp1 != target.father && !target.childrenList.contains(tmp1)) | |||
continue; | |||
if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do")) | |||
{ //is | |||
for(DependencyTreeNode child: tmp1.childrenList) | |||
{ | |||
if(child == target) | |||
continue; | |||
if(isNode(child)) | |||
{ //sth1 | |||
boolean hasPrep = false; | |||
for(DependencyTreeNode grandson: child.childrenList) | |||
{ //prep | |||
if(grandson.dep_father2child.equals("prep")) | |||
hasPrep = true; | |||
} | |||
//Detect modifier: who is the sth1's sth2? | |||
// if(hasPrep || qlog.s.plainText.contains(child.word.originalForm + " 's")) // replaced by detect modifier directly | |||
if(hasPrep || qlog.s.hasModifier(child.word)) | |||
{ | |||
target.word.represent = child.word; | |||
target = child; | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
//test case: 湖上草是[谁]的(诗)? | |||
// if(ds.nodesList.size()>=5) | |||
// { //who | |||
// for(DependencyTreeNode tmp1: ds.nodesList) | |||
// { | |||
// if(tmp1 != target.father && !target.childrenList.contains(tmp1)) | |||
// continue; | |||
// if(tmp1.word.baseForm.equals("be") || tmp1.word.baseForm.equals("do")) | |||
// { //is | |||
// for(DependencyTreeNode child: tmp1.childrenList) | |||
// { | |||
// if(child == target) | |||
// continue; | |||
// if(isNode(child)) | |||
// { //sth1 | |||
// boolean hasPrep = false; | |||
// for(DependencyTreeNode grandson: child.childrenList) | |||
// { //prep | |||
// if(grandson.dep_father2child.equals("prep")) | |||
// hasPrep = true; | |||
// } | |||
// //Detect modifier: who is the sth1's sth2?
// if(hasPrep || qlog.s.hasModifier(child.word)) | |||
// { | |||
// target.word.represent = child.word; | |||
// target = child; | |||
// break; | |||
// } | |||
// } | |||
// } | |||
// } | |||
// } | |||
// } | |||
// by sentence | |||
if(target.word.baseForm.equals("who")) | |||
if(target.word.baseForm.equals("谁")) | |||
{ | |||
int curPos = target.word.position - 1; | |||
// who is usually a coreference when it is not the first word. | |||
if(curPos - 1 >= 0 && isNodeCandidate(words[curPos-1])) | |||
// [Node]是(谁) | |||
if(curPos - 2 >= 0 && isNodeCandidate(words[curPos-2])) | |||
{ | |||
target.word.represent = words[curPos-1]; | |||
target = ds.getNodeByIndex(words[curPos-1].position); | |||
// "谁" at the end of the sentence: 武汉大学的现任[校长]是(谁) | |||
if(curPos == words.length - 1 && (words[curPos-1].baseForm.equals("是") || words[curPos-1].baseForm.equals("有")) ) | |||
{ | |||
target.word.represent = words[curPos-2]; | |||
target = ds.getNodeByIndex(words[curPos-2].position); | |||
} | |||
// [湖上草]是谁的(诗) | |||
if(curPos + 2 == words.length-1 && words[curPos-1].baseForm.equals("是") | |||
&& words[curPos+1].baseForm.equals("的") && isNodeCandidate(words[curPos+2])) | |||
{ | |||
words[curPos+2].represent = words[curPos-2]; | |||
} | |||
} | |||
// Do nothing: [谁]的[女儿]嫁给了王思聪 | |||
} | |||
} | |||
//how | |||
@@ -847,7 +762,7 @@ public class BuildQueryGraph | |||
/* | |||
* There are two cases of [ent]+[type]: 1) Chinese company 2) De_Beers company; | |||
* For case 1, Chinese -> company; for case 2, De_Beers <- company | |||
* Return: True : ent -> type | False : type <- ent | |||
* Return: True : ent -> type | False : ent <- type | |||
* */ | |||
public boolean checkModifyBetweenEntType(Word entWord, Word typeWord) | |||
{ | |||
@@ -868,9 +783,9 @@ public class BuildQueryGraph | |||
* Based on the sentence rather than the dependency tree, as the latter is often incorrect. | |||
* Generally a sequence of nodes modifies its last node; test case 3 is an exception, so we apply a recursive search (see the sketch after this hunk). | |||
* test case: | |||
* 1) the highest Chinese mountain | |||
* 2) the Chinese popular director | |||
* 3) the De_Beers company (company[type]-> De_Beers[ent]) | |||
* 1) 最高的中国山峰 | |||
* 2) 中国流行歌手 | |||
* 3) 谷歌公司 (公司[type] -> 谷歌[ent]) | |||
* */ | |||
public Word getTheModifiedWordBySentence(Sentence s, Word curWord) | |||
{ | |||
@@ -898,14 +813,14 @@ public class BuildQueryGraph | |||
return curWord.modifiedWord = curWord; | |||
} | |||
//modify LEFT: ent + type(cur) : De_Beer company | |||
//modify LEFT: ent + type(cur) : 谷歌 公司 | |||
if(preWord != null && curWord.mayType && preWord.mayEnt) //ent + type(cur) | |||
{ | |||
if(!checkModifyBetweenEntType(preWord, curWord)) //De_Beers <- company; note: even if more nodes follow the type, they are ignored here | |||
return curWord.modifiedWord = preWord; | |||
} | |||
//modify itself: ent(cur) + type : De_Beer company | |||
//modify itself: ent(cur) + type : 谷歌 公司 | |||
if(nextModifiedWord != null && curWord.mayEnt && nextModifiedWord.mayType) | |||
{ | |||
if(!checkModifyBetweenEntType(curWord, nextModifiedWord)) | |||
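A standalone sketch (not the project's implementation) of the modifier rules above, assuming the contract stated for checkModifyBetweenEntType; the word lists and the looksLikeOrganization rule are toy stand-ins:

import java.util.Arrays;
import java.util.List;

public class ModifierSketch {
    // toy stand-ins for word.mayEnt / word.mayType
    static List<String> ents  = Arrays.asList("中国", "谷歌");
    static List<String> types = Arrays.asList("山峰", "公司", "歌手");

    // index of the word that words[i] is taken to modify
    static int modifiedIndex(String[] words, int i) {
        // exception (case 3): ent(cur) + type -> the entity stays the head, the type modifies it
        if (ents.contains(words[i]) && i + 1 < words.length && types.contains(words[i + 1])
                && looksLikeOrganization(words[i]))
            return i;
        // default (cases 1 and 2): modify the last node of the consecutive run
        int last = i;
        while (last + 1 < words.length && (ents.contains(words[last + 1]) || types.contains(words[last + 1])))
            last++;
        return last;
    }

    static boolean looksLikeOrganization(String ent) {
        return ent.equals("谷歌"); // toy rule for this sketch only
    }

    public static void main(String[] args) {
        String[] q1 = {"中国", "山峰"};   // 最高的中国山峰: 中国 modifies 山峰
        String[] q2 = {"谷歌", "公司"};   // 谷歌公司: 公司 modifies 谷歌
        System.out.println(q1[modifiedIndex(q1, 0)]); // 山峰
        System.out.println(q2[modifiedIndex(q2, 0)]); // 谷歌
    }
}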
@@ -16,36 +16,20 @@ public class QuestionParsing { | |||
} | |||
public void getDependenciesAndNER (QueryLogger qlog) { | |||
long t1 = System.currentTimeMillis(); | |||
try { | |||
long t1 = System.currentTimeMillis(); | |||
qlog.s.dependencyTreeStanford = new DependencyTree(qlog.s, Globals.stanfordParser); | |||
}catch(Exception e){ | |||
e.printStackTrace(); | |||
} | |||
long t2 = System.currentTimeMillis(); | |||
try{ | |||
qlog.s.dependencyTreeMalt = new DependencyTree(qlog.s, Globals.maltParser); | |||
}catch(Exception e){ | |||
//if errors occur, abandon malt tree | |||
qlog.s.dependencyTreeMalt = qlog.s.dependencyTreeStanford; | |||
System.err.println("MALT parser error! Use stanford parser instead."); | |||
} | |||
try { | |||
long t3 = System.currentTimeMillis(); | |||
Globals.nerRecognizer.recognize(qlog.s); | |||
long t4 = System.currentTimeMillis(); | |||
long t2 = System.currentTimeMillis(); | |||
// Globals.nerRecognizer.recognize(qlog.s); //TODO: check NER | |||
System.out.println("====StanfordDependencies("+(t2-t1)+"ms)===="); | |||
System.out.println(qlog.s.dependencyTreeStanford); | |||
System.out.println("====MaltDependencies("+(t3-t2)+"ms)===="); | |||
System.out.println(qlog.s.dependencyTreeMalt); | |||
System.out.println("====NameEntityRecognition("+(t4-t3)+"ms)===="); | |||
qlog.s.printNERResult(); | |||
// qlog.s.printNERResult(); | |||
qlog.timeTable.put("StanfordParser", (int)(t2-t1)); | |||
qlog.timeTable.put("MaltParser", (int)(t3-t2)); | |||
qlog.timeTable.put("NER", (int)(t4-t3)); | |||
} catch (Exception e) { | |||
e.printStackTrace(); | |||
} | |||
@@ -53,8 +37,7 @@ public class QuestionParsing { | |||
public void recognizeSentenceType(QueryLogger qlog) | |||
{ | |||
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford)|| | |||
recognizeImperativeSentence(qlog.s.dependencyTreeMalt); | |||
boolean IsImperativeSentence = recognizeImperativeSentence(qlog.s.dependencyTreeStanford); | |||
if (IsImperativeSentence) | |||
{ | |||
qlog.s.sentenceType = SentenceType.ImperativeSentence; | |||
@@ -66,16 +49,14 @@ public class QuestionParsing { | |||
return; | |||
} | |||
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford)|| | |||
recognizeSpecialQuestion(qlog.s.dependencyTreeMalt); | |||
boolean IsSpecialQuestion = recognizeSpecialQuestion(qlog.s.dependencyTreeStanford); | |||
if (IsSpecialQuestion) | |||
{ | |||
qlog.s.sentenceType = SentenceType.SpecialQuestion; | |||
return; | |||
} | |||
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford)|| | |||
recognizeGeneralQuestion(qlog.s.dependencyTreeMalt); | |||
boolean IsGeneralQuestion = recognizeGeneralQuestion(qlog.s.dependencyTreeStanford); | |||
if (IsGeneralQuestion) | |||
{ | |||
qlog.s.sentenceType = SentenceType.GeneralQuestion; | |||
@@ -1,41 +0,0 @@ | |||
package rdf; | |||
import java.util.ArrayList; | |||
import rdf.EntityMapping; | |||
import rdf.TypeMapping; | |||
public class MergedWord implements Comparable<MergedWord> | |||
{ | |||
//original position | |||
public int st,ed; | |||
//position after merge (unselected is -1) | |||
public int mergedPos = -1; | |||
public String name; | |||
public boolean mayCategory = false; | |||
public boolean mayLiteral = false; | |||
public boolean mayEnt = false; | |||
public boolean mayType = false; | |||
public ArrayList<EntityMapping> emList = null; | |||
public ArrayList<TypeMapping> tmList = null; | |||
public String category = null; | |||
public MergedWord(int s,int e,String n) | |||
{ | |||
st = s; | |||
ed = e; | |||
name = n; | |||
} | |||
@Override | |||
//sort from longest to shortest | |||
public int compareTo(MergedWord o) | |||
{ | |||
int lenDiff = (this.ed-this.st) - (o.ed-o.st); | |||
if (lenDiff > 0) return -1; | |||
else if (lenDiff < 0) return 1; | |||
return 0; | |||
} | |||
} |
@@ -65,7 +65,7 @@ public class SimpleRelation { | |||
} | |||
sumSelectivity = matchingScore*sumSelectivity*pidsup.support; | |||
int pid = pidsup.predicateID; | |||
if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; | |||
// if (Globals.pd.dbo_predicate_id.contains(pid)) sumSelectivity *= 1.5; | |||
if (!pasList.containsKey(pid)) | |||
pasList.put(pid, sumSelectivity); | |||
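As a rough worked example of the scoring line above (the numbers are made up): with matchingScore = 0.8, an incoming sumSelectivity of 0.5, and pidsup.support = 200, the candidate predicate is stored in pasList with score 0.8 * 0.5 * 200 = 80; re-enabling the commented-out dbo boost would multiply this by 1.5, giving 120.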