# encoding=utf-8
'''
Step2: remove the duplicate triples.

Reads every line of the triples file and the types file, keeps one copy of
each distinct line, and writes the surviving lines to the clean output file.
Lines are compared as whole strings; no parsing of their content is done.
'''

TRIPLES_PATH = './pkubase/pkubase-triples.txt'
TYPES_PATH = './pkubase/pkubase-types.txt'
CLEAN_PATH = './pkubase/pkubase_clean.txt'

# A progress message is printed each time this many lines of a file are read.
PROGRESS_EVERY = 100000


def _collect_lines(path, file_index, triples, encoding=None):
    """Add every line of the file at *path* to the set *triples*.

    Args:
        path: Text file to read, one record per line.
        file_index: 1-based number of this input file; used only as the
            prefix of the progress messages.
        triples: Set that is updated in place with the lines read.
        encoding: Passed through to ``open()``; ``None`` means the platform
            default.
    """
    with open(path, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, 1):
            # A final line without a newline would otherwise fuse with the
            # next record on output, and would not compare equal to the same
            # record stored with its newline.
            if not line.endswith('\n'):
                line += '\n'
            triples.add(line)
            if line_no % PROGRESS_EVERY == 0:
                print("%d:%d" % (file_index, line_no))


def remove_duplicates(input_paths=(TRIPLES_PATH, TYPES_PATH),
                      output_path=CLEAN_PATH, encoding=None):
    """Write the distinct lines of *input_paths* to *output_path*.

    Duplicates are removed both within a file and across files. The order of
    the lines in the output is the iteration order of a set, i.e. arbitrary.

    Args:
        input_paths: Files to read, in order; progress messages number them
            starting at 1.
        output_path: File to create or overwrite with the distinct lines.
        encoding: Text encoding used for reading and writing; ``None`` uses
            the platform default.

    Returns:
        The number of distinct lines written.
    """
    triples = set()
    for file_index, path in enumerate(input_paths, 1):
        _collect_lines(path, file_index, triples, encoding)
    print(len(triples))

    # The context manager guarantees the output is flushed and closed.
    with open(output_path, 'w', encoding=encoding) as wf:
        wf.writelines(triples)
    return len(triples)


if __name__ == '__main__':
    remove_duplicates()