@@ -0,0 +1,28 @@ | |||||
#encoding=utf-8
'''
In the dbpedia dataset we use two sorts of type: yago types and basic types.
A yago type is a type whose name carries the yago prefix ('<yago:').
A basic type is any other type (an object pointed to by rdf:type).
This script divides these two kinds of types into different files and
renumbers each kind independently.
'''
YAGO_PREFIX = '<yago:'
# Basic types are numbered from 0; yago types are numbered from this value.
YAGO_FIRST_ID = 100000


def split_types(lines, yago_first_id=YAGO_FIRST_ID):
    '''
    Partition type-id lines into basic and yago output lines.

    lines: iterable of "<name>\\t<old id>" lines; only the name (first
        tab-separated field) is used, the old id is discarded.
    yago_first_id: id given to the first yago type encountered.

    Returns (basic, yago): two lists of "<name>\\t<new id>\\n" strings in
    input order.
    '''
    basic = []
    yago = []
    for line in lines:
        # rstrip instead of line[:-1]: a final line without a newline must
        # not lose its last real character.
        name = line.rstrip('\n').split('\t')[0]
        if name.startswith(YAGO_PREFIX):
            yago.append("%s\t%d\n" % (name, yago_first_id + len(yago)))
        else:
            basic.append("%s\t%d\n" % (name, len(basic)))
    return basic, yago


def main():
    '''Read the combined type id file and write the two split files.'''
    with open('type id file here') as f:
        basic, yago = split_types(f)
    with open('basic types id file here', 'w') as f:
        f.writelines(basic)
    with open("yago type id file here", 'w') as f:
        f.writelines(yago)


if __name__ == '__main__':
    main()
@@ -0,0 +1,65 @@ | |||||
import re
'''
Step 1: Clean the triple file. In the dbpedia case, we just need the part of
the resource URI that indicates the entity/type/predicate name.
'''
fileName = []  # List of triple files to be processed

_FIELD_SEP = '$-$-$'
# Hoisted out of the per-line loop; captures the text between the outermost quotes.
_LITERAL_RE = re.compile('"(.*)"')


def split_fields(line):
    '''
    Split one raw triple line into its fields.

    The last three characters of the line are dropped before splitting
    (presumably the " ." terminator plus newline -- confirm against the
    input format). Fields are separated wherever '> <' or '> "' occurs.
    '''
    body = line[:-3]
    body = body.replace('> <', '>' + _FIELD_SEP + '<')
    body = body.replace('> "', '>' + _FIELD_SEP + '"')
    return body.split(_FIELD_SEP)


def shorten_uri(uri):
    '''
    Reduce '<.../name>' to '<name>'.

    If the part after the last '/' contains '#', the piece following the
    first '#' is used instead.
    '''
    pieces = uri[uri.rfind('/') + 1:-1].split('#')
    return '<' + (pieces[0] if len(pieces) < 2 else pieces[1]) + '>'


def strip_literal(item):
    '''Drop any '^^datatype' / '@lang' suffix and return the quoted text.'''
    text = item.split('^^')[0].split('@')[0]
    return '"' + ''.join(_LITERAL_RE.findall(text)) + '"'


def prefix_record(uri, short):
    '''
    Build the 'name^^^prefix\\n' record stored for a predicate.

    uri is the full '<...>' predicate, short its shortened '<name>' form.
    The prefix is the text between the leading '<' and the separator
    character that precedes the last occurrence of name.
    '''
    name = short[1:-1]
    return name + '^^^' + uri[1:uri.rfind(name) - 1] + '\n'


def main():
    '''Clean every file listed in fileName.'''
    # NOTE(review): the source of this script had lost its indentation; the
    # nesting below is the reconstruction that matches its control flow.
    # notRdf records the lines that refer to a type but not rdf:type.
    with open('./notRdf.txt', 'w') as notRdf:
        for fname in fileName:
            # prefix_f keeps the predicate prefixes in case they are useful later.
            with open('./' + fname) as f, \
                    open('output triple files here', 'w') as triple, \
                    open('output prefix files here', 'w') as prefix_f:
                i = 0
                prefix_set = set()
                for line in f:
                    if line[0] != '<':
                        # Not a triple line: report it and move on.
                        print(i)
                        i = i + 1
                        continue
                    fields = split_fields(line)
                    if i == 0:
                        # Carried over from the original: a triple seen while
                        # the counter is still 0 is skipped.
                        i += 1
                        continue
                    if "type>" in fields[1] and "rdf" not in fields[1]:
                        notRdf.write(str(fields) + '\n')
                        continue
                    new_line = []
                    for index, item in enumerate(fields):
                        if not item:
                            # An empty field ends the line; what was collected
                            # so far is still written below.
                            break
                        if item[0] == '<':
                            short = shorten_uri(item)
                            new_line.append(short)
                            if index == 1:
                                prefix_set.add(prefix_record(fields[1], short))
                        elif item.count('"') >= 2:
                            new_line.append(strip_literal(item))
                        else:
                            # Neither a URI nor a quoted literal: report only.
                            print(i)
                    i += 1
                    if i % 1000000 == 0:
                        # The leading 8 is a fixed label kept from the original.
                        print("%d:%d" % (8, i))
                    triple.write('\t'.join(new_line) + '\n')
                for item in prefix_set:
                    prefix_f.write(item)


if __name__ == '__main__':
    main()
@@ -0,0 +1,31 @@ | |||||
# encoding=utf-8
'''
Step2: remove the duplicate triples.
'''
INPUT_FILES = ['./pkubase/pkubase-triples.txt', './pkubase/pkubase-types.txt']
OUTPUT_FILE = './pkubase/pkubase_clean.txt'
PROGRESS_EVERY = 100000


def add_unique_lines(lines, seen, label):
    '''
    Add every line of an input to the set seen.

    lines: iterable of text lines (e.g. an open file).
    seen: set that accumulates the distinct lines; modified in place.
    label: number printed in front of the progress counter ("label:count").

    Returns seen.
    '''
    for count, line in enumerate(lines, 1):
        # A final line without a newline would otherwise fail to match its
        # duplicates and would run into the next line when written out.
        if not line.endswith('\n'):
            line += '\n'
        seen.add(line)
        if count % PROGRESS_EVERY == 0:
            print("%d:%d" % (label, count))
    return seen


def main():
    '''Merge the input files and write each distinct line once.'''
    triples = set()
    for label, path in enumerate(INPUT_FILES, 1):
        with open(path, 'r') as f:
            add_unique_lines(f, triples, label)
    print(len(triples))
    # The with-block guarantees the output is flushed and closed.
    with open(OUTPUT_FILE, 'w') as wf:
        wf.writelines(triples)


if __name__ == '__main__':
    main()
@@ -0,0 +1,58 @@ | |||||
# encoding=utf-8
'''
Step3: extract entities, types and predicates out of the original triple
files and allocate ids.
'''


def collect_names(lines):
    '''
    Scan tab-separated triple lines and collect the distinct names.

    lines: iterable of triple lines. The last two characters of each line are
        dropped before splitting (presumably a terminator plus the newline --
        confirm against the input format).

    Returns (entities, types, predicates, i, k) where the first three are
    sets, i is the running counter the script reports (it starts at 1 and is
    not advanced for triples whose object is a literal) and k is the number
    of lines that had only two fields.
    '''
    entities = set()
    types = set()
    predicates = set()
    i = 1
    k = 0
    # Iterate lazily instead of readlines(): the triple file can be huge.
    for line in lines:
        tri = line[:-2].split('\t')
        entities.add(tri[0])
        predicates.add(tri[1])
        if len(tri) == 2:
            # No object: report the line and keep going.
            print("%s:%d" % (line, i))
            i += 1
            k += 1
            print(tri)
            continue
        if tri[2][0] == '"':
            # Literal objects are not entities.
            continue
        entities.add(tri[2])
        if tri[1] == '<type>':
            types.add(tri[2])
        if i % 10000 == 0:
            print(i)
        i += 1
    return entities, types, predicates, i, k


def id_lines(items):
    '''
    Yield one "<name>\\t<id>\\n" line per item, ids counting up from 0 in
    iteration order. A single trailing newline on a name is removed first.
    '''
    for ident, item in enumerate(items):
        if item.endswith('\n'):
            item = item[:-1]
        yield '%s\t%d\n' % (item, ident)


def main():
    '''Collect the names from the triple file and write the three id files.'''
    with open('triple file here', 'r') as f:
        entities, types, predicates, i, k = collect_names(f)
    print(i)
    print(k)
    outputs = (
        ('entity id file', entities),
        ('type id file', types),
        ('predicate id file', predicates),
    )
    for path, names in outputs:
        with open(path, 'w') as out:
            out.writelines(id_lines(names))


if __name__ == '__main__':
    main()
@@ -0,0 +1,54 @@ | |||||
#encoding=utf-8
'''
Step4: transform the triples and represent entity, type and predicate with id
'''


def load_ids(lines):
    '''
    Build a name -> id dictionary from "<name>\\t<id>" lines.

    Ids are kept as strings. Only the trailing newline is stripped, so a
    final line without one keeps its complete id.
    '''
    table = {}
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        table[fields[0]] = fields[1]
    return table


def encode_triple(tri, eid, tid, pid):
    '''
    Return the id-encoded output line for one triple.

    tri: [subject, predicate, object] strings.
    eid, tid, pid: name -> id dictionaries for entities, types, predicates.

    The object becomes the type id for '<type>' triples ('-1' if the type is
    unknown), '-1' for literals (objects starting with a double quote) and
    the entity id otherwise.

    Raises KeyError when the subject, the predicate or an entity object has
    no id.
    '''
    if tri[1] == '<type>':
        obj = tid.get(tri[2], '-1')
    elif tri[2][0] == '"':
        obj = '-1'
    else:
        obj = eid[tri[2]]
    return "%s\t%s\t%s\n" % (eid[tri[0]], pid[tri[1]], obj)


def main():
    '''Load the three id files and rewrite the triple file with ids.'''
    with open('entity id file here', 'r') as e:
        eid = load_ids(e)
    with open('type id file here', 'r') as t:
        tid = load_ids(t)
    with open('predicate id file here', 'r') as p:
        pid = load_ids(p)
    print("%d %d %d" % (len(eid), len(tid), len(pid)))
    with open("output triple file here", 'w') as rt, \
            open('input triple file here', 'r') as f:
        # enumerate supplies the real line number for the error report.
        for lineno, line in enumerate(f, 1):
            # Drops the last two characters of the line (presumably a
            # terminator plus the newline -- confirm against the input).
            tri = line[:-2].split('\t')
            try:
                rt.write(encode_triple(tri, eid, tid, pid))
            except KeyError:
                # Unknown name: report the line and skip it.
                print(line)
                print(lineno)


if __name__ == '__main__':
    main()
@@ -0,0 +1,132 @@ | |||||
#encoding=utf-8
'''
Build the entity fragment file from the id-encoded triple file.

Each output line is "<entity id>\\t<fragment>" where the fragment holds five
'|'-separated sections:
    in-neighbours   "neighbour:pred;pred;,..."
    out-neighbours  "neighbour:pred;pred;,..."
    in-predicates   "pred,pred,..."
    out-predicates  "pred,pred,..."
    types           "type,type,..."
'''
# Placeholder: replace with the id that was allocated to the <type> predicate.
TYPE_PREDICATE_ID = 'id of <type>'
# Fragments are written for entity ids 0 .. ENTITY_COUNT-1.
ENTITY_COUNT = 12301050


def build_indexes(lines, type_predicate=TYPE_PREDICATE_ID):
    '''
    Read "subject\\tpredicate\\tobject" id triples and index them.

    An object of '-1' marks a triple without an entity object.

    Returns (in_en, out_en, in_edge, out_edge, types):
        in_en[o][s]  -> set of predicates on edges s -> o
        out_en[s][o] -> set of predicates on edges s -> o
        in_edge[o]   -> set of predicates entering o
        out_edge[s]  -> set of predicates leaving s
        types[s]     -> set of type ids of s
    '''
    in_en, out_en, in_edge, out_edge, types = {}, {}, {}, {}, {}
    for count, line in enumerate(lines, 1):
        tri = line.rstrip('\n').split('\t')
        subj, pred, obj = tri[0], tri[1], tri[2]
        if pred == type_predicate and obj != '-1':
            types.setdefault(subj, set()).add(obj)
        else:
            out_edge.setdefault(subj, set()).add(pred)
            if obj != '-1':
                out_en.setdefault(subj, {}).setdefault(obj, set()).add(pred)
                in_edge.setdefault(obj, set()).add(pred)
                in_en.setdefault(obj, {}).setdefault(subj, set()).add(pred)
        if count % 10000 == 0:
            print(count)
    return in_en, out_en, in_edge, out_edge, types


def _format_neighbours(neighbours):
    '''Render {neighbour: predicates} as "neighbour:pred;pred;,..." text.'''
    return ''.join(
        '%s:%s,' % (other, ''.join(p + ';' for p in preds if p != '-1'))
        for other, preds in neighbours.items()
    )


def _format_ids(ids):
    '''Render ids as "id,id,..." text, leaving out '-1'.'''
    return ''.join(ident + ',' for ident in ids if ident != '-1')


def format_fragment(eid, in_en, out_en, in_edge, out_edge, types):
    '''Return the five-section fragment string for entity id eid (a string).'''
    return '|'.join((
        _format_neighbours(in_en.get(eid, {})),
        _format_neighbours(out_en.get(eid, {})),
        _format_ids(in_edge.get(eid, ())),
        _format_ids(out_edge.get(eid, ())),
        _format_ids(types.get(eid, ())),
    ))


def main():
    '''Index the id triples and write one fragment line per entity id.'''
    with open('triple file represented by ids here', 'r') as f:
        indexes = build_indexes(f)
    for index in indexes:
        print(len(index))
    with open('output fragment file', 'w') as wr:
        for i in range(ENTITY_COUNT):  # here we should iterate every entity
            if i % 10000 == 0:
                print(i)
            eid = "%d" % i
            wr.write("%s\t%s\n" % (eid, format_fragment(eid, *indexes)))


if __name__ == '__main__':
    main()
@@ -0,0 +1,40 @@ | |||||
#encoding=utf-8
'''
Build the type fragment file: for every basic type, the predicates entering
and leaving entities of that type, and the entities themselves.
'''
# One output line is produced for each type id 0 .. TYPE_COUNT-1.
TYPE_COUNT = 26043
# Triples with this predicate id are skipped (presumably the id of <type> --
# confirm against the predicate id file).
TYPE_PREDICATE_ID = '208518'


def load_entity_types(lines):
    '''
    Map each entity id to the set of its short type ids.

    lines: entity fragment lines "<entity id>\\t<fragment>"; the fifth
        '|'-separated section of the fragment is the comma-separated type
        list.

    Only non-empty ids shorter than six characters other than '-1' are kept
    (presumably this selects the basic types -- confirm how ids were
    allocated).
    '''
    en2t = {}
    for line in lines:
        dou = line.rstrip('\n').split('\t')
        type_ids = dou[1].split('|')[4].split(',')
        en2t[dou[0]] = set(t for t in type_ids if 0 < len(t) < 6 and t != '-1')
    return en2t


def build_type_fragments(lines, en2t, type_count=TYPE_COUNT,
                         type_predicate=TYPE_PREDICATE_ID):
    '''
    Aggregate id triples per type.

    lines: "subject\\tpredicate\\tobject" id triples; object '-1' means the
        triple has no entity object.
    en2t: entity id -> set of type ids, as built by load_entity_types.

    Returns {type id: [in_predicates, out_predicates, entities]} with an
    entry for every id in range(type_count).

    Raises KeyError for an entity missing from en2t or a type id outside
    range(type_count).
    '''
    lisen = {}
    for i in range(type_count):  # iterate every basic type
        lisen['%d' % i] = [set(), set(), set()]
    for count, line in enumerate(lines, 1):
        if count % 100000 == 0:
            print(count)
        tri = line.rstrip('\n').split('\t')
        subj, pred, obj = tri[0], tri[1], tri[2]
        if pred == type_predicate:
            continue
        for t in en2t[subj]:
            lisen[t][1].add(pred)
            lisen[t][2].add(subj)
        if obj != '-1':
            for t in en2t[obj]:
                lisen[t][0].add(pred)
                lisen[t][2].add(obj)
    return lisen


def format_type_line(type_id, fragment):
    '''Render one fragment as "id\\tin,..|out,..|entity,..\\n".'''
    return type_id + '\t' + '|'.join(','.join(part) for part in fragment) + '\n'


def main():
    '''Read the entity fragments and id triples, write the type fragments.'''
    with open('input entity fragment file here', 'r') as f:
        en2t = load_entity_types(f)
    print("en2t loaded\n")
    with open('triple file represented by ids here', 'r') as f:
        lisen = build_type_fragments(f, en2t)
    with open('output type fragment', 'w') as f:
        for k in lisen:
            f.write(format_type_line(k, lisen[k]))
    print(len(lisen))


if __name__ == '__main__':
    main()
@@ -0,0 +1,43 @@ | |||||
#encoding=utf-8
'''
Build the predicate fragment file: the distinct
"[subject types]\\tpredicate\\t[object types]" combinations, plus one
"[subjects]\\tpredicate\\tliteral" line per predicate.
'''
# One "literal" line is produced for each predicate id 0 .. PREDICATE_COUNT-1.
PREDICATE_COUNT = 408261


def load_entity_types(lines):
    '''
    Map each entity id to the set of its short type ids.

    lines: entity fragment lines "<entity id>\\t<fragment>"; the fifth
        '|'-separated section of the fragment is the comma-separated type
        list.

    Only non-empty ids shorter than six characters other than '-1' are kept
    (presumably this selects the basic types -- confirm how ids were
    allocated).
    '''
    en2t = {}
    for line in lines:
        dou = line.rstrip('\n').split('\t')
        type_ids = dou[1].split('|')[4].split(',')
        en2t[dou[0]] = set(t for t in type_ids if 0 < len(t) < 6 and t != '-1')
    return en2t


def build_predicate_fragments(lines, en2t, predicate_count=PREDICATE_COUNT):
    '''
    Return the set of predicate fragment lines.

    lines: "subject\\tpredicate\\tobject" id triples; '-1' as object means the
        triple has no entity object.
    en2t: entity id -> set of type ids, as built by load_entity_types.

    For a triple with an entity object the line is
    "[subject types]\\tpredicate\\t[object types]\\n". For the others the
    subject id is collected under its predicate, and each predicate id in
    range(predicate_count) yields "[subject ids]\\tpredicate\\tliteral\\n".

    Raises KeyError for an entity missing from en2t or a predicate id
    outside range(predicate_count).
    '''
    sen = set()
    literal_subjects = {}
    for i in range(predicate_count):  # iterate every predicate
        literal_subjects['%d' % i] = set()
    # enumerate drives the progress counter, so progress actually prints.
    for count, line in enumerate(lines, 1):
        if count % 100000 == 0:
            print(count)
        tri = line.rstrip('\n').split('\t')
        subj, pred, obj = tri[0], tri[1], tri[2]
        if subj != '-1':
            pre = '[' + ','.join(en2t[subj]) + ']'
        else:
            pre = '[]'
        if obj != '-1':
            sen.add(pre + '\t' + pred + '\t[' + ','.join(en2t[obj]) + ']\n')
        else:
            literal_subjects[pred].add(subj)
    for pred, subjects in literal_subjects.items():
        sen.add('[' + ','.join(subjects) + ']\t' + pred + '\tliteral\n')
    return sen


def main():
    '''Read the entity fragments and id triples, write predicate fragments.'''
    with open('input entity fragment', 'r') as f:
        en2t = load_entity_types(f)
    with open('triple file represented by ids here', 'r') as f:
        sen = build_predicate_fragments(f, en2t)
    with open('output predicate fragment file', 'w') as f:
        f.writelines(sen)
    print(len(sen))


if __name__ == '__main__':
    main()