Browse Source

Add files via upload

pkubase
Nick Lin GitHub 6 years ago
parent
commit
725a1305ed
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 451 additions and 0 deletions
  1. +28
    -0
      genrate_fragments/extra_get_basic_and_yago.py
  2. +65
    -0
      genrate_fragments/step1_clean_triple.py
  3. +31
    -0
      genrate_fragments/step2_dedubplicate.py
  4. +58
    -0
      genrate_fragments/step3_split.py
  5. +54
    -0
      genrate_fragments/step4_triple_to_number.py
  6. +132
    -0
      genrate_fragments/step5_get_entity_fragment.py
  7. +40
    -0
      genrate_fragments/step6_get_type_fragment.py
  8. +43
    -0
      genrate_fragments/step7_get_predicate_fragment.py

+ 28
- 0
genrate_fragments/extra_get_basic_and_yago.py View File

@@ -0,0 +1,28 @@
#encoding=utf-8
'''
In the dbpedia dataset we use two sorts of type: yago type and basic type.
yago type refers to a type with the yago prefix;
basic type refers to objects pointed to by rdf:type.
This script divides these two kinds of types into different files and
re-numbers each kind with its own consecutive id range.
'''

# First id handed out to each kind of type.
BASIC_START_ID = 0
YAGO_START_ID = 100000


def split_type_ids(lines, basic_start=BASIC_START_ID, yago_start=YAGO_START_ID):
    '''
    Split "name<TAB>id" lines into basic and yago types and renumber them.

    lines: iterable of text lines; only the first tab-separated field (the
        type name) is used, the old id is discarded.
    basic_start / yago_start: first id assigned to each kind.

    Returns (basic, yago): two lists of ready-to-write "name<TAB>id\\n" lines,
    each in input order.
    '''
    basic = []
    yago = []
    next_basic = basic_start
    next_yago = yago_start
    for line in lines:
        # rstrip instead of line[:-1]: a last line without a trailing newline
        # must not lose its final character.
        name = line.rstrip('\n').split('\t')[0]
        if name[:6] == '<yago:':
            yago.append(name + "\t%d\n" % next_yago)
            next_yago += 1
        else:
            basic.append(name + "\t%d\n" % next_basic)
            next_basic += 1
    return basic, yago


def main():
    '''Read the type id file and write the basic and yago id files.'''
    with open('type id file here') as f:
        basic, yago = split_type_ids(f)
    with open('basic types id file here', 'w') as f:
        f.writelines(basic)
    with open("yago type id file here", 'w') as f:
        f.writelines(yago)


if __name__ == '__main__':
    main()

+ 65
- 0
genrate_fragments/step1_clean_triple.py View File

@@ -0,0 +1,65 @@
import re
'''
Step 1: Clean the triple file. In the dbpedia case, we just need the part of
the resource URI that indicates the entity/type/predicate name.
'''
fileName = []  # List of triple files to be processed

# Captures everything between the first and the last double quote of a literal.
# Compiled once instead of once per literal.
LITERAL_PATTERN = re.compile('"(.*)"')


def clean_uri(item):
    '''
    Reduce a "<...>" URI term to "<name>".

    The name is the text after the last '/'; if that text contains '#',
    the piece right after the first '#' is used instead.
    '''
    pos = item.rfind('/')
    word = item[pos + 1:-1].split("#")
    if len(word) < 2:
        return '<' + word[0] + '>'
    return '<' + word[1] + '>'


def clean_literal(item):
    '''
    Reduce a quoted literal to just its quoted text.

    Everything from the first '^^' and from the first '@' onwards is cut off
    before the quoted text is extracted.
    NOTE(review): the '@' cut also truncates literals whose text itself
    contains '@'; kept as in the original.
    '''
    item = item.split('^^')[0].split('@')[0]
    return '"' + ''.join(LITERAL_PATTERN.findall(item)) + '"'


def clean_file(f, triple, prefix_f, notRdf):
    '''
    Clean one triple file.

    f: iterable of input lines "<s> <p> <o-or-literal> ." (the last three
       characters of every line are dropped as the terminator).
    triple: writable object receiving the cleaned, tab-separated triples.
    prefix_f: writable object receiving "name^^^prefix" for every predicate.
    notRdf: writable object recording lines whose predicate contains "type>"
       but not "rdf"; those lines are not written to `triple`.
    '''
    i = 0
    prefix_set = set()
    for line in f:
        if line[0] != '<':
            # Not a triple line: report its position and skip it.
            print(i)
            i = i + 1
            continue
        line = line[:-3].replace('> <', '>$-$-$<').replace('> "', '>$-$-$"')
        line = line.split('$-$-$')
        if i == 0:
            # The very first line is skipped when it is a triple line.
            i += 1
            continue
        if "type>" in line[1] and "rdf" not in line[1]:
            notRdf.write(str(line) + '\n')
            continue
        new_line = []
        for index, item in enumerate(line):
            if not item:
                # Empty term: stop here, the partial triple is still written.
                break
            if item[0] == '<':
                new_line.append(clean_uri(item))
                if index == 1:
                    # Remember which URI prefix the predicate name came from.
                    tmp = new_line[1][1:-1]
                    pos2 = line[1].rfind(tmp)
                    prefix = line[1][1:pos2 - 1]
                    prefix_set.add(tmp + '^^^' + prefix + '\n')
            elif item.count('"') >= 2:
                new_line.append(clean_literal(item))
            else:
                # Neither a URI nor a quoted literal: report and drop the term.
                print(i)
        i += 1
        if i % 1000000 == 0:
            print("%d:%d" % (8, i))
        triple.write('\t'.join(new_line) + '\n')
    for item in prefix_set:
        prefix_f.write(item)


def main():
    '''Clean every file listed in fileName.'''
    # Record the lines that refer to a type but not rdf:type.
    with open('./notRdf.txt', 'w') as notRdf:
        for fname in fileName:
            # NOTE(review): every input file writes to the same two output
            # paths, so later files overwrite earlier results; the names are
            # placeholders to be filled in.
            with open('./' + fname) as f:
                with open('output triple files here', 'w') as triple:
                    # Save the prefixes in case they are useful in the future.
                    with open('output prefix files here', 'w') as prefix_f:
                        clean_file(f, triple, prefix_f, notRdf)


if __name__ == '__main__':
    main()

+ 31
- 0
genrate_fragments/step2_dedubplicate.py View File

@@ -0,0 +1,31 @@
# encoding=utf-8
'''
Step2: remove the duplicate triples.
'''


def add_unique_lines(lines, seen, file_no, report_every=100000):
    '''
    Add every line of `lines` to the set `seen`.

    lines: iterable of text lines.
    seen: set that is updated in place (and also returned).
    file_no: number printed in the progress message "file_no:line_no".
    report_every: progress is printed every this many lines.

    A line without a trailing newline (possible only for the last line of a
    file) gets one appended, so it is deduplicated against its terminated
    twin and cannot be fused with the next line when written out.
    '''
    for j, line in enumerate(lines, 1):
        if not line.endswith('\n'):
            line += '\n'
        seen.add(line)
        if j % report_every == 0:
            print("%d:%d" % (file_no, j))
    return seen


def main():
    '''Merge the triple and type files into one duplicate-free file.'''
    triples = set()
    sources = ['./pkubase/pkubase-triples.txt', './pkubase/pkubase-types.txt']
    for file_no, path in enumerate(sources, 1):
        with open(path, 'r') as f:
            add_unique_lines(f, triples, file_no)
    print(len(triples))
    # The with-block guarantees the output is flushed and closed.
    with open('./pkubase/pkubase_clean.txt', 'w') as wf:
        wf.writelines(triples)


if __name__ == '__main__':
    main()

+ 58
- 0
genrate_fragments/step3_split.py View File

@@ -0,0 +1,58 @@
# encoding=utf-8
'''
Step3: extract entity, type and predicate out of the original triple files and allocate ids
'''


def collect_names(lines):
    '''
    Collect the distinct entity, type and predicate names of a triple file.

    lines: iterable of tab-separated triple lines. The last two characters of
        every line are dropped before splitting.
        NOTE(review): this assumes a two-character line terminator - confirm
        against the actual input file.

    Subjects are always entities. Objects are entities unless they start with
    a double quote (literals). Objects of the '<type>' predicate are recorded
    as types as well as entities.

    Returns (entities, types, predicate) as three sets.
    '''
    entities = set()
    types = set()
    predicate = set()
    i = 1
    k = 0  # number of lines that had only two fields
    # Iterate the file lazily instead of loading it all with readlines().
    for line in lines:
        tri = line[:-2].split('\t')
        entities.add(tri[0])
        predicate.add(tri[1])
        if len(tri) == 2:
            # Malformed line without an object: report and skip it.
            print("%s:%d" % (line, i))
            i += 1
            k += 1
            print(tri)
            continue
        if tri[2][0] == '"':
            continue
        entities.add(tri[2])
        if tri[1] == '<type>':
            types.add(tri[2])
        if i % 10000 == 0:
            print(i)
        i += 1
    print(i)
    print(k)
    return entities, types, predicate


def write_ids(names, out):
    '''
    Write one "name<TAB>id" line per name, ids counting up from 0.

    names: iterable of names; a single trailing newline on a name is removed.
    out: writable text object.
    '''
    for k, item in enumerate(names):
        if item.endswith('\n'):
            item = item[:-1]
        out.write(item + '\t%d' % k + '\n')


def main():
    '''Read the triple file and write the three id files.'''
    with open('triple file here', 'r') as f:
        entities, types, predicate = collect_names(f)
    targets = (
        ('entity id file', entities),
        ('type id file', types),
        ('predicate id file', predicate),
    )
    for path, names in targets:
        with open(path, 'w') as out:
            write_ids(names, out)


if __name__ == '__main__':
    main()

+ 54
- 0
genrate_fragments/step4_triple_to_number.py View File

@@ -0,0 +1,54 @@
#encoding=utf-8
'''
Step4: transform the triples and represent entity, type and predicate with id
'''


def load_ids(lines):
    '''
    Build a name -> id dict from "name<TAB>id" lines.

    Ids are kept as strings. rstrip is used instead of line[:-1] so the id on
    a final line without a trailing newline is not truncated.
    '''
    ids = {}
    for line in lines:
        dub = line.rstrip('\n').split('\t')
        ids[dub[0]] = dub[1]
    return ids


def encode_triple(tri, eid, tid, pid):
    '''
    Return the id form "subject<TAB>predicate<TAB>object\\n" of one triple.

    tri: [subject, predicate, object] names.
    eid / tid / pid: name -> id dicts for entities, types and predicates.

    The object id is:
      * the type id for '<type>' triples, or '-1' when the type is unknown;
      * '-1' when the object is a literal (starts with a double quote);
      * the entity id otherwise.

    Raises KeyError when the subject, the predicate or an entity object has
    no id.
    '''
    if tri[1] == '<type>':
        # `in`/get instead of dict.has_key, which no longer exists in Python 3.
        obj = tid.get(tri[2], '-1')
    elif tri[2][0] == '"':
        obj = '-1'
    else:
        obj = eid[tri[2]]
    return "%s\t%s\t%s\n" % (eid[tri[0]], pid[tri[1]], obj)


def convert_triples(lines, out, eid, tid, pid):
    '''
    Write the id form of every triple line to `out`.

    The last two characters of every line are dropped before splitting.
    NOTE(review): this assumes a two-character line terminator - confirm
    against the actual input file.

    Lines with an unknown name are printed together with their 1-based line
    number and skipped. Returns the number of skipped lines.
    '''
    skipped = 0
    # enumerate supplies the real line number; the original counter was never
    # incremented, so every report said line 1.
    for i, line in enumerate(lines, 1):
        tri = line[:-2].split('\t')
        try:
            encoded = encode_triple(tri, eid, tid, pid)
        except KeyError:
            print(line)
            print(i)
            skipped += 1
            continue
        out.write(encoded)
    return skipped


def main():
    '''Load the three id files and convert the triple file.'''
    with open('entity id file here', 'r') as e:
        eid = load_ids(e)
    with open('type id file here', 'r') as t:
        tid = load_ids(t)
    with open('predicate id file here', 'r') as p:
        pid = load_ids(p)
    print("%d %d %d" % (len(eid), len(tid), len(pid)))
    with open("output triple file here", 'w') as rt:
        with open('input triple file here', 'r') as f:
            convert_triples(f, rt, eid, tid, pid)


if __name__ == '__main__':
    main()

+ 132
- 0
genrate_fragments/step5_get_entity_fragment.py View File

@@ -0,0 +1,132 @@
#encoding=utf-8
'''
Step5: build the entity fragment file from the id-encoded triples.

Every output line is
    entity<TAB>IN_ENTITIES|OUT_ENTITIES|IN_PREDICATES|OUT_PREDICATES|TYPES
where the two entity fields are "neighbour:pred;pred;," groups and the other
three fields are "id," lists (the trailing separators are part of the format).
'''

# Predicate id of <type> in the id-encoded triple file (placeholder to fill in).
TYPE_PREDICATE_ID = 'id of <type>'
# Number of entities; fragments are written for the ids 0 .. ENTITY_COUNT - 1.
ENTITY_COUNT = 12301050


def build_indexes(lines, type_predicate_id=TYPE_PREDICATE_ID):
    '''
    Index "subject<TAB>predicate<TAB>object" id lines.

    Returns (inEnEdge, outEnEdge, inEdge, outEdge, types):
      inEnEdge[o][s]  - predicates of the edges s -> o
      outEnEdge[s][o] - predicates of the edges s -> o
      inEdge[o]       - predicates entering o
      outEdge[s]      - predicates leaving s (including those whose object is '-1')
      types[s]        - type ids of s

    A type triple whose object is '-1' is treated like an ordinary triple.
    Objects equal to '-1' never become neighbours.
    '''
    inEnEdge = {}
    outEnEdge = {}
    inEdge = {}
    outEdge = {}
    types = {}
    for i, line in enumerate(lines, 1):
        # rstrip instead of line[:-1]: keep the object id intact on a final
        # line without a trailing newline.
        tri = line.rstrip('\n').split('\t')
        # setdefault replaces the dict.has_key checks (removed in Python 3).
        if tri[1] == type_predicate_id and tri[2] != '-1':
            types.setdefault(tri[0], set()).add(tri[2])
        else:
            outEdge.setdefault(tri[0], set()).add(tri[1])
            if tri[2] != '-1':
                outEnEdge.setdefault(tri[0], {}).setdefault(tri[2], set()).add(tri[1])
                inEdge.setdefault(tri[2], set()).add(tri[1])
                inEnEdge.setdefault(tri[2], {}).setdefault(tri[0], set()).add(tri[1])
        if i % 10000 == 0:
            print(i)
    return inEnEdge, outEnEdge, inEdge, outEdge, types


def join_neighbours(neighbours):
    '''Format {neighbour: predicates} as "neighbour:pred;pred;," groups.'''
    parts = []
    for k, preds in neighbours.items():
        parts.append(k + ':' + ''.join(p + ';' for p in preds if p != '-1') + ',')
    return ''.join(parts)


def join_ids(ids):
    '''Format ids as "id,id," leaving out every '-1'.'''
    return ''.join(item + ',' for item in ids if item != '-1')


def format_fragment(eid, inEnEdge, outEnEdge, inEdge, outEdge, types):
    '''
    Return the five '|'-separated fragment fields of entity id `eid`.

    An entity missing from an index gets an empty field, so an entity without
    any triple yields "||||".
    '''
    return '|'.join([
        join_neighbours(inEnEdge.get(eid, {})),
        join_neighbours(outEnEdge.get(eid, {})),
        join_ids(inEdge.get(eid, ())),
        join_ids(outEdge.get(eid, ())),
        join_ids(types.get(eid, ())),
    ])


def main():
    '''Index the id-encoded triple file and write one fragment per entity.'''
    with open('triple file represented by ids here', 'r') as f:
        indexes = build_indexes(f)
    # Sizes in the order inEnEdge, outEnEdge, inEdge, outEdge, types.
    for index in indexes:
        print(len(index))
    with open('output fragment file', 'w') as wr:
        for i in range(ENTITY_COUNT):  # here we should iterate every entity
            if i % 10000 == 0:
                print(i)
            eid = "%d" % i
            wr.write("%s\t%s\n" % (eid, format_fragment(eid, *indexes)))


if __name__ == '__main__':
    main()

+ 40
- 0
genrate_fragments/step6_get_type_fragment.py View File

@@ -0,0 +1,40 @@
#encoding=utf-8
'''
Step6: build the type fragment file.

Every output line is
    type<TAB>IN_PREDICATES|OUT_PREDICATES|ENTITIES
with each field a comma-separated id list.
'''

# Triples with this predicate id are ignored.
# NOTE(review): presumably the id of <type> - confirm against the predicate id file.
TYPE_PREDICATE_ID = '208518'
# Number of basic types; fragments are created for the ids 0 .. BASIC_TYPE_COUNT - 1.
BASIC_TYPE_COUNT = 26043


def load_entity_types(lines):
    '''
    Build entity id -> set of type ids from entity fragment lines.

    The types are the fifth '|'-separated field of the fragment. Only
    non-empty ids shorter than six characters and different from '-1' are
    kept.
    NOTE(review): the length limit presumably keeps basic types only - confirm.
    '''
    en2t = {}
    for line in lines:
        # rstrip instead of line[:-1]: a final line without a newline would
        # otherwise lose its last '|' and fail on the index below.
        dou = line.rstrip('\n').split('\t')
        types = dou[1].replace('|', '#').split('#')[4]
        en2t[dou[0]] = set(
            t for t in types.split(',') if 0 < len(t) < 6 and t != '-1'
        )
    return en2t


def build_type_fragments(lines, en2t, type_count=BASIC_TYPE_COUNT,
                         type_predicate_id=TYPE_PREDICATE_ID):
    '''
    Build type id -> [in predicates, out predicates, entities].

    lines: id-encoded "subject<TAB>predicate<TAB>object" lines.
    en2t: entity id -> set of type ids, as returned by load_entity_types.

    For every triple whose predicate is not `type_predicate_id`, the predicate
    is an out-predicate of each type of the subject and, when the object is
    not '-1', an in-predicate of each type of the object. The entity itself is
    recorded for the type in both cases.

    Raises KeyError for an entity missing from `en2t` or a type id outside
    0 .. type_count - 1.
    '''
    lisen = {}
    for i in range(type_count):  # iterate every basic type
        lisen['%d' % i] = [set(), set(), set()]
    for i, line in enumerate(lines, 1):
        if i % 100000 == 0:
            print(i)
        tri = line.rstrip('\n').split('\t')
        if tri[1] == type_predicate_id:
            continue
        # No length re-check needed: en2t only holds ids shorter than six chars.
        for t in en2t[tri[0]]:
            lisen[t][1].add(tri[1])
            lisen[t][2].add(tri[0])
        if tri[2] != '-1':
            for t in en2t[tri[2]]:
                lisen[t][0].add(tri[1])
                lisen[t][2].add(tri[2])
    return lisen


def format_type_fragment(k, fragment):
    '''Return the output line of type id `k` for its three id sets.'''
    return (k + '\t' + ','.join(fragment[0]) + '|' + ','.join(fragment[1])
            + '|' + ','.join(fragment[2]) + '\n')


def main():
    '''Read the entity fragments and triples, write the type fragments.'''
    with open('input entity fragment file here', 'r') as f:
        en2t = load_entity_types(f)
    print("en2t loaded\n")
    with open('triple file represented by ids here', 'r') as f:
        lisen = build_type_fragments(f, en2t)
    with open('output type fragment', 'w') as f:
        for k in lisen:
            f.write(format_type_fragment(k, lisen[k]))
    print(len(lisen))


if __name__ == '__main__':
    main()

+ 43
- 0
genrate_fragments/step7_get_predicate_fragment.py View File

@@ -0,0 +1,43 @@
#encoding=utf-8
'''
Step7: build the predicate fragment file.

Output lines are either
    [subject types]<TAB>predicate<TAB>[object types]
or, for triples whose object is '-1',
    [subject ids]<TAB>predicate<TAB>literal
'''

# Number of predicates; a "literal" line is written for each id 0 .. PREDICATE_COUNT - 1.
PREDICATE_COUNT = 408261


def load_entity_types(lines):
    '''
    Build entity id -> set of type ids from entity fragment lines.

    The types are the fifth '|'-separated field of the fragment. Only
    non-empty ids shorter than six characters and different from '-1' are
    kept.
    '''
    en2t = {}
    for line in lines:
        # rstrip instead of line[:-1]: a final line without a newline would
        # otherwise lose its last '|' and fail on the index below.
        dou = line.rstrip('\n').split('\t')
        types = dou[1].replace('|', '#').split('#')[4]
        en2t[dou[0]] = set(
            t for t in types.split(',') if 0 < len(t) < 6 and t != '-1'
        )
    return en2t


def build_predicate_fragments(lines, en2t, predicate_count=PREDICATE_COUNT):
    '''
    Return the set of predicate fragment lines.

    lines: id-encoded "subject<TAB>predicate<TAB>object" lines.
    en2t: entity id -> set of type ids, as returned by load_entity_types.

    A triple whose object is not '-1' contributes one line made of the
    subject's types, the predicate and the object's types. A triple whose
    object is '-1' adds its subject id to the predicate's "literal" line.
    Every predicate id below `predicate_count` gets a "literal" line, even
    when it has no such subject.

    Raises KeyError for an entity missing from `en2t`, or for a predicate id
    outside 0 .. predicate_count - 1 on a triple whose object is '-1'.
    '''
    sen = set()
    lisen = {}
    for i in range(predicate_count):  # iterate every predicate
        lisen['%d' % i] = set()
    # enumerate drives the progress counter; the original never incremented
    # it, so no progress was ever printed.
    for i, line in enumerate(lines, 1):
        if i % 100000 == 0:
            print(i)
        tri = line.rstrip('\n').split('\t')
        if tri[0] != '-1':
            pre = '[' + ','.join(en2t[tri[0]]) + ']'
        else:
            pre = '[]'
        if tri[2] != '-1':
            pos = '[' + ','.join(en2t[tri[2]]) + ']\n'
            sen.add(pre + '\t' + tri[1] + '\t' + pos)
        else:
            lisen[tri[1]].add(tri[0])
    for k in lisen:
        sen.add('[' + ','.join(lisen[k]) + ']\t' + k + '\tliteral\n')
    return sen


def main():
    '''Read the entity fragments and triples, write the predicate fragments.'''
    with open('input entity fragment', 'r') as f:
        en2t = load_entity_types(f)
    with open('triple file represented by ids here', 'r') as f:
        sen = build_predicate_fragments(f, en2t)
    with open('output predicate fragment file', 'w') as f:
        for item in sen:
            f.write(item)
    print(len(sen))


if __name__ == '__main__':
    main()

Loading…
Cancel
Save