|
#encoding=utf-8
"""Generate the predicate fragment file.

Reads an entity fragment file (entity id -> type ids) and a triple file whose
subject / predicate / object columns are ids, and writes one de-duplicated
line per distinct fragment:

    [subject types]<TAB>predicate<TAB>[object types]
    [subject ids]<TAB>predicate<TAB>literal

An id of '-1' is treated as "no entity" in the subject and object columns.
"""

# Number of predicate ids that are pre-seeded ('0' .. str(PREDICATE_COUNT - 1)).
# Every pre-seeded id produces a "literal" line, even when no literal triple
# uses it.  The value is specific to the data set the script was written for.
PREDICATE_COUNT = 408261

# How often (in triples) a progress line is printed.
PROGRESS_INTERVAL = 100000


def load_entity_types(lines):
    """Parse entity fragment lines into a dict of entity id -> set of type ids.

    Each line is ``<entity id>\\t<fragment>``.  Within the fragment, '|' and
    '#' are both treated as field separators and the fifth field (index 4) is
    read as a comma-separated list of type ids.  A type id is kept only when
    it is non-empty, shorter than 6 characters and not '-1'.

    ``lines`` may be any iterable of strings (an open file works).  A line
    with fewer than two tab-separated columns, or fewer than five fragment
    fields, raises IndexError.
    """
    en2t = {}
    for line in lines:
        # rstrip('\n') rather than [:-1]: a final line without a trailing
        # newline must not lose its last data character.
        dou = line.rstrip('\n').split('\t')
        types = dou[1].replace('|', '#').split('#')[4]
        en2t[dou[0]] = {
            t for t in types.split(',')
            if 0 < len(t) < 6 and t != '-1'
        }
    return en2t


def build_predicate_fragments(triple_lines, en2t, predicate_count=PREDICATE_COUNT):
    """Return the set of output lines derived from the id-encoded triples.

    ``triple_lines`` is an iterable of ``subject\\tpredicate\\tobject`` lines.

    * Object is not '-1': emits ``[subject types]\\tpredicate\\t[object types]``,
      where the subject part is ``[]`` when the subject is '-1'.
    * Object is '-1': the subject id is collected under the predicate and
      later emitted as ``[subject ids]\\tpredicate\\tliteral``.

    Raises KeyError when a subject/object id is missing from ``en2t`` or when
    a literal triple uses a predicate id outside ``range(predicate_count)``.

    NOTE(review): the "literal" lines list raw subject ids, whereas the other
    lines list type ids -- confirm this asymmetry is intended.
    """
    sen = set()
    lisen = dict(('%d' % i, set()) for i in range(predicate_count))

    # enumerate() supplies the running count; previously the counter was never
    # advanced, so the progress line could never be printed.
    for count, line in enumerate(triple_lines, 1):
        if count % PROGRESS_INTERVAL == 0:
            print(count)
        tri = line.rstrip('\n').split('\t')

        # The subject lookup happens before the object is inspected, so an
        # unknown subject id fails fast for literal triples as well.
        if tri[0] != '-1':
            pre = '[' + ','.join(en2t[tri[0]]) + ']'
        else:
            pre = '[]'

        if tri[2] != '-1':
            pos = '[' + ','.join(en2t[tri[2]]) + ']\n'
            sen.add(pre + '\t' + tri[1] + '\t' + pos)
        else:
            lisen[tri[1]].add(tri[0])

    for k, subjects in lisen.items():
        sen.add('[' + ','.join(subjects) + ']\t' + k + '\tliteral\n')
    return sen


def main():
    """Read both input files, write the fragment file, print the line count."""
    with open('input entity fragment', 'r') as f:
        en2t = load_entity_types(f)

    with open('triple file represented by ids here', 'r') as f:
        sen = build_predicate_fragments(f, en2t)

    with open('output predicate fragment file', 'w') as f:
        f.writelines(sen)
    print(len(sen))


if __name__ == '__main__':
    main()
-
-
|