You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

base_loader.py 759 B

7 years ago
12345678910111213141516171819202122232425262728293031
  1. class BaseLoader(object):
  2. """docstring for BaseLoader"""
  3. def __init__(self, data_name, data_path):
  4. super(BaseLoader, self).__init__()
  5. self.data_name = data_name
  6. self.data_path = data_path
  7. def load(self):
  8. """
  9. :return: string
  10. """
  11. with open(self.data_path, "r", encoding="utf-8") as f:
  12. text = f.read()
  13. return text
  14. class ToyLoader0(BaseLoader):
  15. """
  16. For charLM
  17. """
  18. def __init__(self, name, path):
  19. super(ToyLoader0, self).__init__(name, path)
  20. def load(self):
  21. with open(self.data_path, 'r') as f:
  22. corpus = f.read().lower()
  23. import re
  24. corpus = re.sub(r"<unk>", "unk", corpus)
  25. return corpus.split()

一款轻量级的自然语言处理(NLP)工具包,目标是减少用户项目中的工程型代码,例如数据处理循环、训练循环、多卡运行等