"""Step 1: clean the triple files.

For DBpedia-style N-Triples dumps only the last component of each resource
URI (the entity / type / predicate name) is needed.  Every accepted input
line becomes one tab-separated row of ``<name>`` and ``"literal"`` fields.

Side outputs:

* ``./notRdf.txt`` receives the lines whose predicate ends in ``type>`` but
  does not mention ``rdf`` (i.e. a "type" predicate that is not rdf:type).
* the prefix file receives one ``name^^^prefix`` entry per distinct
  predicate, in case the stripped prefixes are useful later.
"""

import re

fileName = []  # List of triple files to be processed.

NOT_RDF_PATH = './notRdf.txt'
# Placeholders: edit these to the real output locations before running.
# Both are reopened in 'w' mode for every input file, as in the original.
TRIPLE_OUT_PATH = 'output triple files here'
PREFIX_OUT_PATH = 'output prefix files here'

_SEPARATOR = '$-$-$'
_LITERAL_RE = re.compile('"(.*)"')  # compiled once instead of per literal
_PROGRESS_EVERY = 1000000


def split_terms(raw_line):
    """Split one raw N-Triples line into its terms.

    The trailing whitespace and the statement-terminating ``.`` are removed
    explicitly rather than by chopping a fixed three characters, so a final
    line without a newline (or a line with no space before the dot) is not
    corrupted.

    NOTE(review): terms are separated wherever ``> <`` or ``> "`` occurs, so
    a literal whose text contains one of those sequences is split as well.
    This is kept from the original because it is also what makes four-term
    (quad) lines come out as four columns.
    """
    body = raw_line.rstrip()
    if body.endswith('.'):
        body = body[:-1].rstrip()
    marked = body.replace('> <', '>' + _SEPARATOR + '<')
    marked = marked.replace('> "', '>' + _SEPARATOR + '"')
    return marked.split(_SEPARATOR)


def local_name(uri_term):
    """Return the last component of a ``<...>`` term, wrapped in ``<>``.

    The component is whatever follows the last ``/``; if that part contains
    ``#``, the piece right after the first ``#`` is used instead.
    """
    # Clamp to 1 so a term without any '/' does not keep its leading '<'.
    start = max(uri_term.rfind('/') + 1, 1)
    pieces = uri_term[start:-1].split('#')
    chosen = pieces[0] if len(pieces) < 2 else pieces[1]
    return '<' + chosen + '>'


def literal_text(literal_term):
    """Return the quoted text of a literal without language tag or datatype.

    The greedy pattern spans from the first to the last double quote, which
    drops a trailing ``@lang`` / ``^^<datatype>`` without truncating text
    that itself contains ``@`` or ``^^``.
    """
    found = _LITERAL_RE.search(literal_term)
    return '"' + (found.group(1) if found else '') + '"'


def prefix_entry(predicate_term, predicate_name):
    """Build the ``name^^^prefix`` record for a predicate.

    ``predicate_name`` is the ``<name>`` produced by :func:`local_name`; the
    prefix is the part of the IRI before the name, minus the one separator
    character (``/`` or ``#``) that precedes it.
    """
    bare = predicate_name[1:-1]
    cut = predicate_term.rfind(bare)
    return bare + '^^^' + predicate_term[1:cut - 1] + '\n'


def clean_lines(lines, prefixes, reject):
    """Yield one cleaned, tab-joined row per accepted input line.

    Args:
        lines: iterable of raw text lines from a triple file.
        prefixes: set that collects :func:`prefix_entry` records (mutated).
        reject: callable receiving the text to log for each line whose
            predicate ends in ``type>`` but does not contain ``rdf``.

    Line counters of skipped / unrecognised input and a progress message
    are printed to stdout.
    """
    i = 0
    for raw_line in lines:
        if not raw_line.startswith('<'):
            # Comment, blank-node subject or blank line: report and skip.
            print(i)
            i += 1
            continue

        parts = split_terms(raw_line)

        # NOTE(review): the first '<' line seen while the counter is still 0
        # is dropped.  Kept from the original; this loses the first triple
        # of a file that has no leading comment line -- confirm intent.
        if i == 0:
            i += 1
            continue

        if len(parts) < 2:
            # No predicate could be separated (e.g. tab-delimited input).
            # Previously this raised IndexError and aborted the whole run.
            print(i)
            i += 1
            continue

        predicate = parts[1]
        if 'type>' in predicate and 'rdf' not in predicate:
            reject(str(parts) + '\n')
            continue

        fields = []
        for position, term in enumerate(parts):
            if not term:
                # Empty term: stop here, the partial row is still emitted.
                break
            if term[0] == '<':
                name = local_name(term)
                fields.append(name)
                if position == 1:
                    prefixes.add(prefix_entry(term, name))
            elif term.count('"') >= 2:
                fields.append(literal_text(term))
            else:
                # Neither an IRI nor a quoted literal: report the counter.
                print(i)

        i += 1
        if i % _PROGRESS_EVERY == 0:
            # NOTE(review): the leading 8 is hard-coded in the original
            # progress message (presumably a file number) -- confirm.
            print("%d:%d" % (8, i))
        yield '\t'.join(fields) + '\n'


def main():
    """Clean every file listed in ``fileName``."""
    with open(NOT_RDF_PATH, 'w') as not_rdf:
        for fname in fileName:
            with open('./' + fname) as source, \
                    open(TRIPLE_OUT_PATH, 'w') as triple, \
                    open(PREFIX_OUT_PATH, 'w') as prefix_f:
                prefixes = set()
                for row in clean_lines(source, prefixes, not_rdf.write):
                    triple.write(row)
                prefix_f.writelines(prefixes)


if __name__ == '__main__':
    main()