hummingbird
/
gAnswer

import re
'''
Step 1: Clean the triple files. In the DBpedia case, we only need the part of each resource URI that indicates the entity/type/predicate name.
'''
fileName = []  # List of triple files to be processed

# Compiled once: captures everything between the first and the last double
# quote of a literal term (greedy), so "@" or "^^" inside the text is kept.
_LITERAL_BODY = re.compile('"(.*)"')


def _strip_terminator(raw):
    """Return *raw* without trailing whitespace and without the closing '.'."""
    statement = raw.rstrip()
    if statement.endswith('.'):
        statement = statement[:-1].rstrip()
    return statement


def _local_name(term):
    """Return the short name of a ``<...>`` URI term, wrapped in ``<`` ``>``.

    The short name is what follows the last '/'; when that tail contains a
    '#', the piece right after the first '#' is used instead.
    """
    tail = term[term.rfind('/') + 1:-1]
    pieces = tail.split('#')
    return '<' + (pieces[0] if len(pieces) < 2 else pieces[1]) + '>'


def _literal_text(term):
    """Return the quoted body of a literal, without language tag or datatype."""
    return '"' + ''.join(_LITERAL_BODY.findall(term)) + '"'


def clean_triple_file(src, triple_out, prefix_out, not_rdf, file_index=0):
    """Clean one triple file, keeping only the short names of its URIs.

    Args:
        src: iterable of raw statement lines (e.g. an open text file).
        triple_out: writable text stream; receives one tab-separated cleaned
            triple per line.
        prefix_out: writable text stream; receives the sorted, de-duplicated
            ``name^^^prefix`` lines collected from the predicates.
        not_rdf: writable text stream; receives the ``str()`` of every split
            statement whose predicate contains ``type>`` but not ``rdf``.
        file_index: number shown in the progress message printed every
            1,000,000 lines.

    Returns:
        The number of lines that were skipped or cut short: lines not
        starting with '<', statements with fewer than two terms, and
        statements containing an empty term.
    """
    seen = 0
    skipped = 0
    prefixes = set()
    for raw in src:
        if not raw.startswith('<'):
            # Comment/header or otherwise non-triple line: report and skip.
            print(seen)
            seen += 1
            skipped += 1
            continue
        # Mark the term boundaries first so that spaces inside literals do
        # not split the statement.
        terms = (_strip_terminator(raw)
                 .replace('> <', '>$-$-$<')
                 .replace('> "', '>$-$-$"')
                 .split('$-$-$'))
        if seen == 0:
            # NOTE(review): when the very first line of the file is already a
            # triple it is dropped here; kept as in the original script --
            # confirm whether this is intended.
            seen += 1
            continue
        if len(terms) < 2:
            # No predicate: report and skip instead of failing on terms[1].
            print(seen)
            seen += 1
            skipped += 1
            continue
        predicate = terms[1]
        if 'type>' in predicate and 'rdf' not in predicate:
            not_rdf.write(str(terms) + '\n')
            continue
        cleaned = []
        for position, term in enumerate(terms):
            if not term:
                skipped += 1
                break
            if term[0] == '<':
                cleaned.append(_local_name(term))
                if position == 1:
                    # Remember which prefix this predicate name came from.
                    name = cleaned[-1][1:-1]
                    prefix = predicate[1:predicate.rfind(name) - 1]
                    prefixes.add(name + '^^^' + prefix + '\n')
            elif term.count('"') >= 2:
                cleaned.append(_literal_text(term))
            else:
                # Neither a URI nor a quoted literal: report it and leave the
                # term out of the cleaned triple.
                print(seen)
        seen += 1
        if seen % 1000000 == 0:
            print("%d:%d" % (file_index, seen))
        triple_out.write('\t'.join(cleaned) + '\n')
    # Sorted so that the prefix file is identical from run to run.
    prefix_out.writelines(sorted(prefixes))
    return skipped


# Record the lines that refer to a type but not rdf:type.
with open('./notRdf.txt', 'w') as notRdf:
    for index2, fname in enumerate(fileName):
        # The prefixes are saved to a file in case they are useful later.
        with open('./' + fname) as f, \
                open('output triple files here', 'w') as triple, \
                open('output prefix files here', 'w') as prefix_f:
            clean_triple_file(f, triple, prefix_f, notRdf, index2)