You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

dataset_loader.py 1.3 kB

7 years ago
1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. from loader.base_loader import BaseLoader
  class DatasetLoader(BaseLoader):
      """Loader for data sets.

      Adds no behavior of its own: construction is delegated entirely to
      ``BaseLoader``.
      """

      def __init__(self, data_name, data_path):
          """
          :param data_name: passed through unchanged to ``BaseLoader.__init__``
          :param data_path: passed through unchanged to ``BaseLoader.__init__``
          """
          # Positional forwarding, exactly as the base-class constructor is
          # invoked by every subclass in this module.
          super().__init__(data_name, data_path)
  class ConllLoader(DatasetLoader):
      """Loader for CoNLL-format files.

      On construction the file at ``data_path`` is read and parsed, and the
      result is stored in ``self.data_set``.
      """

      def __init__(self, data_name, data_path):
          """
          :param str data_name: the name of the conll data set
          :param str data_path: the path to the conll data set
          """
          # NOTE: this method was previously misspelled ``__int__``, so it
          # never ran on instantiation and ``self.data_set`` was never set.
          super().__init__(data_name, data_path)
          self.data_set = self.parse(self.load())

      def load(self):
          """Read the whole file as UTF-8 text.

          Reads ``self.data_path`` (presumably assigned by the base-class
          constructor -- confirm against ``BaseLoader``).

          :return: list lines: all lines in a conll file, line endings kept
          """
          with open(self.data_path, "r", encoding="utf-8") as f:
              return f.readlines()

      @staticmethod
      def parse(lines):
          """Group token lines into sentences.

          Lines starting with ``#`` are comments and are skipped.  A blank
          (empty or whitespace-only) line ends the current sentence.  Every
          other line is split on whitespace into its fields.

          :param list lines: a list containing all lines in a conll file.
          :return: a 3D list: sentences -> tokens -> whitespace-split fields.
              Sentences without any token are never emitted.
          """
          sentences = []
          tokens = []
          for line in lines:
              # startswith() is safe on "" where line[0] would raise IndexError.
              if line.startswith("#"):
                  continue
              fields = line.split()
              if not fields:
                  # Sentence separator.  Runs of blank lines must not produce
                  # empty sentences, so only flush when tokens were collected.
                  if tokens:
                      sentences.append(tokens)
                      tokens = []
                  continue
              tokens.append(fields)
          # Keep the last sentence when the input has no trailing blank line.
          if tokens:
              sentences.append(tokens)
          return sentences

一款轻量级的自然语言处理(NLP)工具包,目标是减少用户项目中的工程型代码,例如数据处理循环、训练循环、多卡运行等