@@ -0,0 +1,28 @@ | |||
#encoding=utf-8
'''
In the dbpedia dataset we use two sorts of type: yago type and basic type.
A yago type is a type whose name carries the yago prefix.
A basic type is any other object pointed to by rdf:type.
This script divides these two kinds of types into different files.
'''

# Names starting with this prefix are treated as yago types.
YAGO_PREFIX = '<yago:'
# Yago ids are numbered upwards from this offset; basic ids start at 0.
YAGO_ID_START = 100000


def split_type_ids(lines, basic_start=0, yago_start=YAGO_ID_START):
    """Partition type records into basic and yago types and renumber them.

    lines: iterable of records whose first tab-separated column is the
        type name (any further columns are ignored).
    basic_start / yago_start: first id handed out in each group.

    Returns (basic, yago): two lists of "name<TAB>id\\n" strings, in input
    order, each group numbered consecutively from its start value.
    """
    basic = []
    yago = []
    for line in lines:
        # Strip only a real newline so that a final line without one does
        # not lose its last character.
        name = line.rstrip('\n').split('\t')[0]
        if name.startswith(YAGO_PREFIX):
            yago.append("%s\t%d\n" % (name, yago_start + len(yago)))
        else:
            basic.append("%s\t%d\n" % (name, basic_start + len(basic)))
    return basic, yago


def split_type_file(src_path, basic_path, yago_path):
    """Read the type id file and write the basic and yago id files."""
    with open(src_path) as src:
        basic, yago = split_type_ids(src)
    with open(basic_path, 'w') as out:
        out.writelines(basic)
    with open(yago_path, 'w') as out:
        out.writelines(yago)


if __name__ == '__main__':
    # The three paths are placeholders to be filled in before running.
    split_type_file('type id file here',
                    'basic types id file here',
                    "yago type id file here")
@@ -0,0 +1,65 @@ | |||
import re
'''
Step 1: Clean the triple file. In the dbpedia case, we just need the part of
the resource URI that indicates the entity/type/predicate name.
'''
fileName = []  # List of triple files to be processed

# Temporary marker used to split a line into its fields.
_FIELD_SEP = '$-$-$'
# Greedy on purpose: captures everything between the first and the last
# double quote, so '@' or '^^' inside the literal text does not truncate it.
_LITERAL_RE = re.compile('"(.*)"')


def split_triple_line(raw_line):
    """Split one raw triple line into its fields.

    The last three characters are dropped (presumably the ' .' terminator
    plus newline of an N-Triples line -- confirm against the input files),
    then the line is cut at every '> <' and '> "' boundary.
    """
    body = raw_line[:-3]
    body = body.replace('> <', '>' + _FIELD_SEP + '<')
    body = body.replace('> "', '>' + _FIELD_SEP + '"')
    return body.split(_FIELD_SEP)


def _local_name(uri_token):
    """Reduce '<.../name>' or '<...#name>' to '<name>'."""
    tail = uri_token[uri_token.rfind('/') + 1:-1]
    parts = tail.split('#')
    return '<' + (parts[0] if len(parts) < 2 else parts[1]) + '>'


def clean_fields(fields):
    """Clean the fields of one triple.

    Returns (cleaned, prefix_entry, unrecognised):
      cleaned       list of cleaned tokens; processing stops at the first
                    empty field, so the list may be shorter than `fields`.
      prefix_entry  "name^^^prefix\\n" for the field at index 1 when it is
                    a URI, otherwise None.
      unrecognised  number of fields that are neither a '<...>' token nor a
                    literal with at least two double quotes.
    """
    cleaned = []
    prefix_entry = None
    unrecognised = 0
    for index, item in enumerate(fields):
        if not item:
            break
        if item[0] == '<':
            token = _local_name(item)
            cleaned.append(token)
            if index == 1:
                name = token[1:-1]
                # Drop the leading '<' and the separator just before name.
                prefix = item[1:item.rfind(name) - 1]
                prefix_entry = name + '^^^' + prefix + '\n'
        elif item.count('"') >= 2:
            cleaned.append('"' + ''.join(_LITERAL_RE.findall(item)) + '"')
        else:
            unrecognised += 1
    return cleaned, prefix_entry, unrecognised


def clean_triple_file(src, triple_out, prefix_out, not_rdf_out, file_index=0):
    """Clean every triple read from `src`.

    src: iterable of raw lines.
    triple_out / prefix_out / not_rdf_out: writable file-like objects that
        receive the cleaned triples, the collected predicate prefixes, and
        the lines whose predicate contains 'type>' but not 'rdf'.
    file_index: number shown in the progress message.
    """
    i = 0
    prefix_set = set()
    for raw in src:
        if raw[:1] != '<':
            # Not a triple line (e.g. a comment): report and skip it.
            print(i)
            i += 1
            continue
        fields = split_triple_line(raw)
        if i == 0:
            # NOTE(review): kept from the original -- when the file has no
            # leading non-triple line, its first triple is skipped here.
            i += 1
            continue
        if len(fields) < 2:
            # No predicate field: report the line instead of crashing.
            print(i)
            i += 1
            continue
        if "type>" in fields[1] and "rdf" not in fields[1]:
            not_rdf_out.write(str(fields) + '\n')
            continue
        cleaned, prefix_entry, unrecognised = clean_fields(fields)
        # NOTE(review): the source's indentation was lost; this follows the
        # reading where the trailing `else` belongs to the if/elif chain, so
        # a line is reported per unrecognised field and still written.
        for _ in range(unrecognised):
            print(i)
        if prefix_entry is not None:
            prefix_set.add(prefix_entry)
        i += 1
        if i % 1000000 == 0:
            print("%d:%d" % (file_index, i))
        triple_out.write('\t'.join(cleaned) + '\n')
    for item in prefix_set:
        prefix_out.write(item)


def run_clean_step(file_names):
    """Run the cleaning step over every file in `file_names`."""
    # Records the lines that refer to a type but not through rdf:type.
    with open('./notRdf.txt', 'w') as not_rdf:
        for index2, fname in enumerate(file_names):
            # NOTE: both output paths are placeholders; with a fixed name
            # each input file overwrites the output of the previous one.
            with open('./' + fname) as src:
                with open('output triple files here', 'w') as triple:
                    # Prefixes are saved in case they are useful later.
                    with open('output prefix files here', 'w') as prefix_f:
                        clean_triple_file(src, triple, prefix_f, not_rdf,
                                          index2)


if __name__ == '__main__':
    run_clean_step(fileName)
@@ -0,0 +1,31 @@ | |||
# encoding=utf-8
'''
Step 2: remove the duplicate triples.
'''


def add_unique_lines(lines, triples, source_no, progress_every=100000):
    """Add every line from `lines` to the set `triples`.

    A line that lacks its trailing newline (typically the last line of a
    file) gets one appended, so it is recognised as a duplicate of the same
    triple elsewhere and does not run into the next record when written.

    source_no: number shown in the progress message "source_no:count".
    Returns `triples` for convenience.
    """
    for count, line in enumerate(lines, 1):
        if not line.endswith('\n'):
            line += '\n'
        triples.add(line)
        if count % progress_every == 0:
            print("%d:%d" % (source_no, count))
    return triples


def dedupe_files(input_paths, output_path):
    """Merge the input files into `output_path` without duplicate lines.

    The output order is the iteration order of the set, i.e. unspecified.
    Returns the set of unique lines.
    """
    triples = set()
    for source_no, path in enumerate(input_paths, 1):
        with open(path, 'r') as f:
            add_unique_lines(f, triples, source_no)
    print(len(triples))
    with open(output_path, 'w') as wf:
        wf.writelines(triples)
    return triples


if __name__ == '__main__':
    dedupe_files(['./pkubase/pkubase-triples.txt',
                  './pkubase/pkubase-types.txt'],
                 './pkubase/pkubase_clean.txt')
@@ -0,0 +1,58 @@ | |||
# encoding=utf-8
'''
Step 3: extract entities, types and predicates out of the original triple
files and allocate ids.
'''


def collect_names(lines):
    """Collect the entity, type and predicate names used by the triples.

    lines: iterable of tab-separated triple lines. The last two characters
        of every line are dropped before splitting (assumed to be a
        one-character terminator plus the newline -- TODO confirm against
        the triple file format).

    Subjects are always entities. Objects are entities unless they start
    with a double quote (literals). The object of a '<type>' predicate is
    also a type. Lines with fewer than three fields or an empty object are
    printed and counted instead of raising.

    Returns (entities, types, predicate) as sets of names.
    """
    entities = set()
    types = set()
    predicate = set()
    short_lines = 0
    total = 0
    for total, line in enumerate(lines, 1):
        if total % 10000 == 0:
            print(total)
        tri = line[:-2].split('\t')
        if len(tri) >= 2:
            entities.add(tri[0])
            predicate.add(tri[1])
        if len(tri) < 3 or not tri[2]:
            print("%s:%d" % (line, total))
            print(tri)
            short_lines += 1
            continue
        if tri[2].startswith('"'):
            continue
        entities.add(tri[2])
        if tri[1] == '<type>':
            types.add(tri[2])
    print(total)
    print(short_lines)
    return entities, types, predicate


def format_id_lines(items):
    """Return "name<TAB>id\\n" records, numbering `items` from 0 in order.

    A single trailing newline on a name is removed first.
    """
    records = []
    for k, item in enumerate(items):
        if item.endswith('\n'):
            item = item[:-1]
        records.append('%s\t%d\n' % (item, k))
    return records


def write_id_file(path, items):
    """Write the id records for `items` to `path`."""
    with open(path, 'w') as out:
        out.writelines(format_id_lines(items))


if __name__ == '__main__':
    # The input path is a placeholder to be filled in before running.
    with open('triple file here', 'r') as f:
        entities, types, predicate = collect_names(f)
    # Ids follow the iteration order of each set.
    write_id_file('entity id file', entities)
    write_id_file('type id file', types)
    write_id_file('predicate id file', predicate)
@@ -0,0 +1,54 @@ | |||
#encoding=utf-8
'''
Step 4: transform the triples and represent entity, type and predicate by id.
'''


def load_id_map(lines):
    """Build a name -> id dict from "name<TAB>id" records.

    Ids are kept as strings, exactly as they appear in the file.
    """
    mapping = {}
    for line in lines:
        dub = line.rstrip('\n').split('\t')
        mapping[dub[0]] = dub[1]
    return mapping


def encode_triple(tri, eid, pid, tid):
    """Return the id form "s<TAB>p<TAB>o\\n" of one triple.

    tri: [subject, predicate, object] names.
    eid / pid / tid: name -> id dicts for entities, predicates and types.

    The object of a '<type>' predicate is looked up in `tid` and becomes
    '-1' when the type is unknown. An object starting with a double quote
    is a literal and is always written as '-1'. Any other object is looked
    up in `eid`.

    Raises KeyError for an unknown subject, predicate or entity object and
    IndexError when `tri` has fewer than three fields.
    """
    subject_id = eid[tri[0]]
    predicate_id = pid[tri[1]]
    if tri[1] == '<type>':
        object_id = tid.get(tri[2], '-1')
    elif tri[2].startswith('"'):
        object_id = '-1'
    else:
        object_id = eid[tri[2]]
    return "%s\t%s\t%s\n" % (subject_id, predicate_id, object_id)


def encode_triple_file(src, dst, eid, pid, tid):
    """Encode every line of `src` into `dst`.

    Lines that cannot be encoded are printed together with their 1-based
    line number and skipped.
    """
    for lineno, line in enumerate(src, 1):
        # Drops the last two characters of the line (assumed to be a
        # one-character terminator plus newline -- TODO confirm).
        tri = line[:-2].split('\t')
        try:
            dst.write(encode_triple(tri, eid, pid, tid))
        except (KeyError, IndexError):
            print(line)
            print(lineno)


if __name__ == '__main__':
    # All paths are placeholders to be filled in before running.
    with open('entity id file here', 'r') as e:
        eid = load_id_map(e)
    with open('type id file here', 'r') as t:
        tid = load_id_map(t)
    with open('predicate id file here', 'r') as p:
        pid = load_id_map(p)
    print("%d %d %d" % (len(eid), len(tid), len(pid)))
    with open('input triple file here', 'r') as f:
        with open("output triple file here", 'w') as rt:
            encode_triple_file(f, rt, eid, pid, tid)
@@ -0,0 +1,132 @@ | |||
#encoding=utf-8 | |||
inEnEdge = {} | |||
outEnEdge = {} | |||
inEdge={} | |||
outEdge = {} | |||
types = {} | |||
with open('triple file represented by ids here','r') as f: | |||
i = 1 | |||
for line in f: | |||
tri = line[:-1].split('\t') | |||
if tri[1] == 'id of <type>' and tri[2]!='-1': | |||
if types.has_key(tri[0]): | |||
types[tri[0]].add(tri[2]) | |||
else: | |||
types[tri[0]] = set() | |||
types[tri[0]].add(tri[2]) | |||
else: | |||
if outEdge.has_key(tri[0]): | |||
outEdge[tri[0]].add(tri[1]) | |||
else: | |||
outEdge[tri[0]] = set() | |||
outEdge[tri[0]].add(tri[1]) | |||
if tri[2]!='-1': | |||
if outEnEdge.has_key(tri[0]): | |||
if outEnEdge[tri[0]].has_key(tri[2]): | |||
outEnEdge[tri[0]][tri[2]].add(tri[1]) | |||
else: | |||
outEnEdge[tri[0]][tri[2]] = set() | |||
outEnEdge[tri[0]][tri[2]].add(tri[1]) | |||
else: | |||
outEnEdge[tri[0]]={} | |||
outEnEdge[tri[0]][tri[2]] = set() | |||
outEnEdge[tri[0]][tri[2]].add(tri[1]) | |||
if inEdge.has_key(tri[2]): | |||
inEdge[tri[2]].add(tri[1]) | |||
else: | |||
inEdge[tri[2]] = set() | |||
inEdge[tri[2]].add(tri[1]) | |||
if inEnEdge.has_key(tri[2]): | |||
if inEnEdge[tri[2]].has_key(tri[0]): | |||
inEnEdge[tri[2]][tri[0]].add(tri[1]) | |||
else: | |||
inEnEdge[tri[2]][tri[0]] = set() | |||
inEnEdge[tri[2]][tri[0]].add(tri[1]) | |||
else: | |||
inEnEdge[tri[2]] = {} | |||
inEnEdge[tri[2]][tri[0]] = set() | |||
inEnEdge[tri[2]][tri[0]].add(tri[1]) | |||
if i%10000 == 0: | |||
print(i) | |||
i += 1 | |||
print(len(inEnEdge)) | |||
print(len(outEnEdge)) | |||
print(len(inEdge)) | |||
print(len(outEdge)) | |||
print(len(types)) | |||
wr = open('output fragment file','w') | |||
for i in range(12301050):#here we should iterate every entitiy | |||
if i%10000 == 0: | |||
print(i) | |||
eid = "%d"%i | |||
ret = "" | |||
tmp = "" | |||
if inEnEdge.has_key(eid): | |||
tmp = "" | |||
for k in inEnEdge[eid].keys(): | |||
tmp += k | |||
tmp += ':' | |||
for item in inEnEdge[eid][k]: | |||
if item == '-1': | |||
continue | |||
tmp += item + ';' | |||
tmp += ',' | |||
ret += tmp | |||
tmp = "" | |||
ret += '|' | |||
if outEnEdge.has_key(eid): | |||
tmp = "" | |||
for k in outEnEdge[eid].keys(): | |||
tmp += k | |||
tmp += ':' | |||
for item in outEnEdge[eid][k]: | |||
if item == '-1': | |||
continue | |||
tmp += item + ';' | |||
tmp += ',' | |||
ret += tmp | |||
tmp = "" | |||
ret += '|' | |||
if inEdge.has_key(eid): | |||
tmp = "" | |||
for item in inEdge[eid]: | |||
if item == '-1': | |||
continue | |||
tmp += item + ',' | |||
ret += tmp | |||
tmp="" | |||
ret += '|' | |||
if outEdge.has_key(eid): | |||
tmp = "" | |||
for item in outEdge[eid]: | |||
if item == '-1': | |||
continue | |||
tmp += item + ',' | |||
ret += tmp | |||
tmp="" | |||
ret += '|' | |||
if types.has_key(eid): | |||
tmp = "" | |||
for item in types[eid]: | |||
if item == '-1': | |||
continue | |||
tmp += item + ',' | |||
ret += tmp | |||
tmp="" | |||
wr.write("%s\t%s\n"%(eid,ret)) | |||
@@ -0,0 +1,40 @@ | |||
#encoding=utf-8
# Builds the type fragment file: for every basic type id, the predicates
# entering its entities, the predicates leaving its entities, and the
# entities themselves.

# Entity id -> set of its basic type ids, read from the entity fragment file.
en2t = {}
with open('input entity fragment file here', 'r') as fragment_file:
    for record in fragment_file:
        columns = record[:-1].split('\t')
        # '|' and '#' both act as section separators; the fifth section
        # holds the comma-separated type ids.
        type_section = columns[1].replace('|', '#').split('#')[4]
        # Keep non-empty ids shorter than six characters, except '-1'.
        en2t[columns[0]] = set(
            candidate for candidate in type_section.split(',')
            if 0 < len(candidate) < 6 and candidate != '-1'
        )
print("en2t loaded\n")

# Type id -> [incoming predicates, outgoing predicates, entities],
# pre-created for every basic type id.
lisen = dict(('%d' % type_no, [set(), set(), set()])
             for type_no in range(26043))

with open('triple file represented by ids here', 'r') as triple_file:
    for lineno, record in enumerate(triple_file, 1):
        if lineno % 100000 == 0:
            print(lineno)
        tri = record[:-1].split('\t')
        # Triples whose predicate id is '208518' are skipped entirely.
        if tri[1] == '208518':
            continue
        for type_id in en2t[tri[0]]:
            if len(type_id) <= 5:
                slots = lisen[type_id]
                slots[1].add(tri[1])
                slots[2].add(tri[0])
        # NOTE(review): the source's indentation was lost; the object side
        # is taken to be nested under the predicate check above.
        if tri[2] == '-1':
            continue
        for type_id in en2t[tri[2]]:
            if len(type_id) <= 5:
                slots = lisen[type_id]
                slots[0].add(tri[1])
                slots[2].add(tri[2])

with open('output type fragment', 'w') as out:
    for type_id in lisen.keys():
        incoming, outgoing, members = lisen[type_id]
        sections = [','.join(incoming), ','.join(outgoing), ','.join(members)]
        out.write(type_id + '\t' + '|'.join(sections) + '\n')
print(len(lisen))
@@ -0,0 +1,43 @@ | |||
#encoding=utf-8
'''
Build the predicate fragment file from the entity fragments and the
id-encoded triples.
'''

# Number of predicate ids (ids 0 .. PREDICATE_COUNT - 1). Every one of them
# gets a "literal" line in the output, even when it has no literal triple.
PREDICATE_COUNT = 408261


def load_entity_types(lines):
    """Return entity id -> set of basic type ids from entity fragments.

    Each record is "entity<TAB>sections", where '|' and '#' both act as
    section separators and the fifth section holds comma-separated type
    ids. Only non-empty ids shorter than six characters other than '-1'
    are kept.
    """
    en2t = {}
    for line in lines:
        dou = line.rstrip('\n').split('\t')
        type_section = dou[1].replace('|', '#').split('#')[4]
        en2t[dou[0]] = set(
            t for t in type_section.split(',')
            if 0 < len(t) < 6 and t != '-1')
    return en2t


def predicate_fragments(triple_lines, en2t, predicate_count=PREDICATE_COUNT):
    """Return the set of predicate fragment lines.

    triple_lines: iterable of "subject<TAB>predicate<TAB>object" records.
    en2t: entity id -> set of type ids, as built by load_entity_types.
    predicate_count: number of predicate ids that get a literal line.

    A triple with an entity object yields
    "[subject types]<TAB>predicate<TAB>[object types]\\n". Triples whose
    object is '-1' are grouped per predicate into
    "[subject ids]<TAB>predicate<TAB>literal\\n".

    Raises KeyError when a subject or object is missing from `en2t`.
    """
    sen = set()
    lisen = dict(('%d' % n, set()) for n in range(predicate_count))
    for i, line in enumerate(triple_lines, 1):
        if i % 100000 == 0:
            print(i)
        tri = line.rstrip('\n').split('\t')
        if tri[0] != '-1':
            pre = '[' + ','.join(en2t[tri[0]]) + ']'
        else:
            pre = '[]'
        if tri[2] != '-1':
            pos = '[' + ','.join(en2t[tri[2]]) + ']\n'
            sen.add(pre + '\t' + tri[1] + '\t' + pos)
        else:
            # NOTE(review): subject entity ids, not their types, are
            # collected here, unlike the entity-object case above -- kept
            # as in the original; confirm this is intended.
            lisen.setdefault(tri[1], set()).add(tri[0])
    for k, subjects in lisen.items():
        sen.add('[' + ','.join(subjects) + ']\t' + k + '\tliteral\n')
    return sen


if __name__ == '__main__':
    # All paths are placeholders to be filled in before running.
    with open('input entity fragment', 'r') as f:
        en2t = load_entity_types(f)
    with open('triple file represented by ids here', 'r') as f:
        sen = predicate_fragments(f, en2t)
    with open('output predicate fragment file', 'w') as f:
        f.writelines(sen)
    print(len(sen))