from loader.base_loader import BaseLoader


class DatasetLoader(BaseLoader):
    """Loader for data sets."""

    def __init__(self, data_name, data_path):
        """
        :param str data_name: the name of the data set
        :param str data_path: the path to the data set
        """
        super(DatasetLoader, self).__init__(data_name, data_path)


class ConllLoader(DatasetLoader):
    """Loader for CoNLL format files."""

    def __init__(self, data_name, data_path):
        """
        Read and parse the CoNLL file at construction time.

        The parsed result is stored in ``self.data_set``.

        :param str data_name: the name of the conll data set
        :param str data_path: the path to the conll data set
        """
        # NOTE: this method used to be misspelled ``__int__``, so it never ran
        # on instantiation and ``data_set`` was never populated.
        super(ConllLoader, self).__init__(data_name, data_path)
        self.data_set = self.parse(self.load())

    def load(self):
        """
        Read every line of the file at ``self.data_path`` (decoded as UTF-8).

        ``self.data_path`` is presumably set by ``BaseLoader.__init__`` --
        confirm against the base class.

        :return: list lines: all lines in a conll file, line endings kept
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            return f.readlines()

    @staticmethod
    def parse(lines):
        """
        Group the token lines of a CoNLL file into sentences.

        Lines starting with ``#`` are treated as comments and skipped. A blank
        (empty or whitespace-only) line ends the current sentence. Every other
        line is split on whitespace into its columns.

        :param list lines: a list containing all lines in a conll file.
        :return: a 3D list: sentences -> tokens -> column strings
        """
        sentences = []
        tokens = []
        for line in lines:
            # startswith() is safe on an empty string, unlike line[0].
            if line.startswith("#"):
                continue
            columns = line.split()
            if columns:
                tokens.append(columns)
                continue
            # Blank line: close the current sentence. Empty sentences (from
            # repeated blank lines or a comment block followed by a blank
            # line) are not emitted.
            if tokens:
                sentences.append(tokens)
                tokens = []
        # Keep the final sentence when the input lacks a trailing blank line.
        if tokens:
            sentences.append(tokens)
        return sentences