# ============================================================================
# NOTE(review): the original source is a git patch flattened onto a few very
# long lines.  The scripts below were re-indented from it.  The middle of
# extra_get_basic_and_yago.py and the head of the step1 normalisation script
# were lost in the flattening and could not be recovered; the splice points
# are marked with TODO comments.
# ============================================================================

# --- genrate_fragments/extra_get_basic_and_yago.py (new file; head only) ---
# encoding=utf-8
'''
    In the dbpedia dataset we use two sorts of type: yago type and basic type.
    A yago type is a type with the yago prefix; a basic type is an object
    pointed to by rdf:type.  This script divides the two kinds of types into
    different files.
'''
basic = []      # rows for basic types
yago = []       # rows for yago types
b = 0           # id counter for basic types
y = 100000      # id counter for yago types (disjoint range from basic ids)
with open('type id file here') as f:
    for line in f:
        dou = line[:-1].split('\t')
        if dou[0][:6] == '<yago:':  # TODO(review): prefix literal truncated
            # TODO(review): the body of this branch and the rest of this
            # script were lost in the flattened patch; presumably it assigned
            # y-range ids to yago types and b-range ids to basic types.
            pass

# --- tail of the step1 triple-normalisation script (its head was lost in
#     the patch; the open() calls and counters below are a best-effort
#     reconstruction, marked TODO) ---
import re

f = open('input ttl file here')                      # TODO(review): reconstructed
triple = open('output triple file here', 'w')        # TODO(review): reconstructed
prefix_f = open('output prefix file here', 'w')      # TODO(review): reconstructed
notRdf = open('output non-rdf type file here', 'w')  # TODO(review): reconstructed
prefix_set = set()
i = 0
count = 0
for line in f:
    # Split "<s> <p> <o>"-style lines on the separators between terms.
    line = line.replace('> <', '>$-$-$<').replace('> "', '>$-$-$"')
    line = line.split('$-$-$')
    if i == 0:  # skip the first (header) line
        i += 1
        continue
    new_line = []
    # Keep only rdf-namespaced "type" predicates; log the others and skip.
    if "type>" in line[1]:
        if "rdf" not in line[1]:
            notRdf.write(str(line) + '\n')
            continue
    for index, item in enumerate(line):
        if not item:  # empty field: malformed line, count it and drop it
            count += 1
            break
        if item[0] == '<':
            # IRI term: keep only the local name (after the last '/' or '#').
            pos = item.rfind('/')
            word = item[pos + 1:-1].split("#")
            if len(word) < 2:
                new_line.append('<' + word[0] + '>')
            else:
                new_line.append('<' + word[1] + '>')
            if index == 1:
                # Remember the namespace prefix of every predicate.
                tmp = new_line[1][1:len(new_line[1]) - 1]
                pos2 = line[1].rfind(tmp)
                prefix = line[1][1:pos2 - 1]
                prefix_set.add(tmp + '^^^' + prefix + '\n')
            continue
        elif item.count('"') >= 2:
            # Literal term: drop datatype (^^) / language (@) suffixes and
            # re-quote the bare value.
            item = item.split('^^')[0].split('@')[0]
            pattern = re.compile('"(.*)"')
            word = '"' + ''.join(pattern.findall(item)) + '"'
            new_line.append(word)
            continue
        else:
            print(i)  # neither IRI nor literal: report the line number
    i += 1
    if i % 1000000 == 0:
        print("%d:%d" % (8, i))
    triple.write('\t'.join(new_line) + '\n')
for item in prefix_set:
    prefix_f.write(item)
f.close()
triple.close()
prefix_f.close()

# --- genrate_fragments/step2_dedubplicate.py (new file) ---
# encoding=utf-8
'''
Step2: remove the duplicate triples.
'''
triples = set()
j = 1
i = 1
with open('./pkubase/pkubase-triples.txt', 'r') as f:
    for line in f:  # iterate the file directly instead of readline() loops
        triples.add(line)
        if j % 100000 == 0:
            print("%d:%d" % (i, j))
        j += 1
j = 1
i = 2
with open('./pkubase/pkubase-types.txt', 'r') as f:
    for line in f:
        triples.add(line)
        if j % 100000 == 0:
            print("%d:%d" % (i, j))
        j += 1
print(len(triples))
# BUG FIX: the output handle was never closed; a context manager guarantees
# the final buffer is flushed.
with open('./pkubase/pkubase_clean.txt', 'w') as wf:
    for item in triples:
        wf.write(item)

# --- genrate_fragments/step3_split.py (new file) ---
# encoding=utf-8
'''
Step3: extract entity, type and predicate out of the original triple files
and allocate ids.
'''
entities = set()
types = set()
predicate = set()
with open('triple file here', 'r') as f:
    i = 1
    k = 0
    for line in f:  # stream the file instead of f.readlines() (no full read)
        tri = line[:-2].split('\t')
        entities.add(tri[0])
        predicate.add(tri[1])
        if len(tri) == 2:  # malformed line without an object
            print("%s:%d" % (line, i))
            i += 1
            k += 1
            print(tri)
            continue
        # BUG FIX: the original tested the identical condition twice
        # (`'"' in tri[2][0] or '"' in tri[2][0]`); one check on the first
        # character is sufficient and equivalent.
        if tri[2][0] == '"':
            continue  # literal object: not an entity
        entities.add(tri[2])
        if tri[1] == '':  # placeholder for the rdf:type predicate
            types.add(tri[2])
        if i % 10000 == 0:
            print(i)
        i += 1
    print(i)
    print(k)


def _dump_ids(items, path):
    # Write one "<item>\t<id>" line per item, stripping any trailing newline.
    with open(path, 'w') as out:
        for k, item in enumerate(items):
            out.write(item.rstrip('\n') + '\t%d' % k + '\n')


# BUG FIX: the three output files were opened but never closed/flushed;
# the duplicated write loops are folded into one helper.
_dump_ids(entities, 'entity id file')
_dump_ids(types, 'type id file')
_dump_ids(predicate, 'predicate id file')

# --- (the patch header for genrate_fragments/step4_triple_to_number.py ends
#      this span; that script follows on the next line of the flattened
#      source) ---
# --- genrate_fragments/step4_triple_to_number.py (new file) ---
# encoding=utf-8
'''
Step4: transform the triples and represent entity, type and predicate with id
'''
eid = {}
tid = {}
pid = {}

# Load the three "<name>\t<id>" mapping files produced by step3.
with open('entity id file here', 'r') as e:
    for line in e:
        dub = line[:-1].split('\t')
        eid[dub[0]] = dub[1]

with open('type id file here', 'r') as t:
    for line in t:
        dub = line[:-1].split('\t')
        tid[dub[0]] = dub[1]

with open('predicate id file here', 'r') as p:
    for line in p:
        dub = line[:-1].split('\t')
        pid[dub[0]] = dub[1]

print("%d %d %d" % (len(eid), len(tid), len(pid)))

# BUG FIXES: `dict.has_key()` does not exist in Python 3 (the file already
# uses print() as a function) — use the `in` operator; the line counter `i`
# was never incremented, so every KeyError reported line 1; the output handle
# was never closed.
with open('input triple file here', 'r') as f, \
        open("output triple file here", 'w') as rt:
    for i, line in enumerate(f, 1):
        tri = line[:-2].split('\t')
        if tri[1] == '':  # placeholder for the rdf:type predicate
            if tri[2] not in tid:
                tid[tri[2]] = '-1'  # unknown type: map to the -1 sentinel
            try:
                rt.write("%s\t%s\t%s\n" % (eid[tri[0]], pid[tri[1]], tid[tri[2]]))
            except KeyError:
                print(line)
                print(i)
        elif tri[2][0] == '"':  # literal object: no entity id, emit -1
            try:
                rt.write("%s\t%s\t-1\n" % (eid[tri[0]], pid[tri[1]]))
            except KeyError:
                print(line)
                print(i)
        else:  # entity object
            try:
                rt.write("%s\t%s\t%s\n" % (eid[tri[0]], pid[tri[1]], eid[tri[2]]))
            except KeyError:
                print(line)
                print(i)

# --- genrate_fragments/step5_get_entity_fragment.py (new file) ---
# encoding=utf-8
# BUG FIX: the original used dict.has_key() throughout, which was removed in
# Python 3; collections.defaultdict makes the grouping valid and far shorter.
from collections import defaultdict

inEnEdge = defaultdict(lambda: defaultdict(set))   # object  -> subject -> predicates
outEnEdge = defaultdict(lambda: defaultdict(set))  # subject -> object  -> predicates
inEdge = defaultdict(set)    # object  -> incoming predicate ids
outEdge = defaultdict(set)   # subject -> outgoing predicate ids
types = defaultdict(set)     # entity  -> type ids

with open('triple file represented by ids here', 'r') as f:
    i = 1
    for line in f:
        tri = line[:-1].split('\t')
        if tri[1] == 'id of ' and tri[2] != '-1':  # rdf:type triple
            types[tri[0]].add(tri[2])
        else:
            outEdge[tri[0]].add(tri[1])
            if tri[2] != '-1':  # object is a real entity, not a literal
                outEnEdge[tri[0]][tri[2]].add(tri[1])
                inEdge[tri[2]].add(tri[1])
                inEnEdge[tri[2]][tri[0]].add(tri[1])
        if i % 10000 == 0:
            print(i)
        i += 1

print(len(inEnEdge))
print(len(outEnEdge))
print(len(inEdge))
print(len(outEdge))
print(len(types))


def _entity_edges(mapping):
    """Serialise {entity: {predicates}} as 'e:p;p;,e:p;,' skipping -1."""
    out = []
    for ent, preds in mapping.items():
        out.append(ent + ':' + ''.join(p + ';' for p in preds if p != '-1') + ',')
    return ''.join(out)


def _pred_list(preds):
    """Serialise a predicate set as 'p,p,' skipping the -1 sentinel."""
    return ''.join(p + ',' for p in preds if p != '-1')


# BUG FIX: output handle was never closed.  The '|' separators are emitted
# unconditionally so the record always has five fields (step6 indexes
# field 4 of the split).
with open('output fragment file', 'w') as wr:
    for n in range(12301050):  # iterate every entity id (total entity count)
        if n % 10000 == 0:
            print(n)
        eid = "%d" % n
        ret = ""
        if eid in inEnEdge:
            ret += _entity_edges(inEnEdge[eid])
        ret += '|'
        if eid in outEnEdge:
            ret += _entity_edges(outEnEdge[eid])
        ret += '|'
        if eid in inEdge:
            ret += _pred_list(inEdge[eid])
        ret += '|'
        if eid in outEdge:
            ret += _pred_list(outEdge[eid])
        ret += '|'
        if eid in types:
            ret += _pred_list(types[eid])
        wr.write("%s\t%s\n" % (eid, ret))

# --- (the patch header for genrate_fragments/step6_get_type_fragment.py
#      begins at the end of this span and continues on the next line) ---
# --- genrate_fragments/step6_get_type_fragment.py (new file) ---
# encoding=utf-8
en2t = {}
# Load each entity's basic-type set from the entity fragment file.
# After replacing '|' with '#', field 4 is the comma-separated type list.
with open('input entity fragment file here', 'r') as f:
    for line in f:
        dou = line[:-1].split('\t')
        types = dou[1].replace('|', '#').split('#')[4]
        typeset = types.split(',')
        en2t[dou[0]] = set()
        for t in typeset:
            # basic-type ids are below 100000, i.e. at most 5 digits
            if len(t) < 6 and t != '-1' and len(t) > 0:
                en2t[dou[0]].add(t)
print("en2t loaded\n")

lisen = {}
for i in range(26043):  # one bucket per basic type id
    # [in-predicates, out-predicates, member entities]
    lisen['%d' % i] = [set(), set(), set()]

with open('triple file represented by ids here', 'r') as f:
    i = 1
    for line in f:
        if i % 100000 == 0:
            print(i)
        i += 1
        tri = line[:-1].split('\t')
        # NOTE(review): '208518' is presumably the id of the rdf:type
        # predicate — confirm against the predicate id file.
        if tri[1] != '208518':
            for t in en2t[tri[0]]:
                if len(t) <= 5:
                    lisen[t][1].add(tri[1])  # subject's types gain an out-edge
                    lisen[t][2].add(tri[0])
            if tri[2] != '-1':
                for t in en2t[tri[2]]:
                    if len(t) <= 5:
                        lisen[t][0].add(tri[1])  # object's types gain an in-edge
                        lisen[t][2].add(tri[2])

with open('output type fragment', 'w') as f:
    for k in lisen.keys():
        f.write(k + '\t' + ','.join(lisen[k][0]) + '|'
                + ','.join(lisen[k][1]) + '|' + ','.join(lisen[k][2]) + '\n')
    print(len(lisen))

# --- genrate_fragments/step7_get_predicate_fragment.py (new file) ---
# encoding=utf-8
en2t = {}
# Same entity -> basic-type loading as step6.
with open('input entity fragment', 'r') as f:
    for line in f:
        dou = line[:-1].split('\t')
        types = dou[1].replace('|', '#').split('#')[4]
        typeset = types.split(',')
        en2t[dou[0]] = set()
        for t in typeset:
            if len(t) < 6 and t != '-1' and len(t) > 0:
                en2t[dou[0]].add(t)

sen = set()
lisen = {}
for i in range(408261):  # one bucket per predicate id
    lisen['%d' % i] = set()

with open('triple file represented by ids here', 'r') as f:
    i = 1
    for line in f:
        if i % 100000 == 0:
            print(i)
        # BUG FIX: the counter was never incremented in the original, so the
        # progress message above could never fire.
        i += 1
        tri = line[:-1].split('\t')
        if tri[0] != '-1':
            subj_types = '[' + ','.join(en2t[tri[0]]) + ']'
        else:
            subj_types = '[]'
        if tri[2] != '-1':
            obj_types = '[' + ','.join(en2t[tri[2]]) + ']\n'
            # (renamed from `str`, which shadowed the builtin)
            record = subj_types + '\t' + tri[1] + '\t' + obj_types
            sen.add(record)
        else:
            # literal-valued triples: collect subjects per predicate
            lisen[tri[1]].add(tri[0])

for k in lisen.keys():
    record = '[' + ','.join(lisen[k]) + ']\t' + k + '\tliteral\n'
    sen.add(record)

with open('output predicate fragment file', 'w') as f:
    for item in sen:
        f.write(item)
    print(len(sen))