You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

build_system_pipeline.py 5.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. import argparse
  2. from d3m import index
  3. from d3m.metadata.base import ArgumentType
  4. from d3m.metadata.pipeline import Pipeline, PrimitiveStep
  5. # Creating pipeline
  6. pipeline_description = Pipeline()
  7. pipeline_description.add_input(name='inputs')
  8. #Step 0: Denormalise
  9. step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.common.denormalize'))
  10. step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
  11. step_0.add_output('produce')
  12. pipeline_description.add_step(step_0)
  13. #Step 1: Convert the dataset to a DataFrame
  14. step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
  15. step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
  16. step_1.add_output('produce')
  17. pipeline_description.add_step(step_1)
  18. #Step 2: Read the csvs corresponding to the paths in the Dataframe in the form of arrays
  19. step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.common.csv_reader'))
  20. step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
  21. step_2.add_output('produce')
  22. step_2.add_hyperparameter(name = 'use_columns', argument_type=ArgumentType.VALUE, data = [0,1])
  23. step_2.add_hyperparameter(name = 'return_result', argument_type=ArgumentType.VALUE, data = 'replace')
  24. pipeline_description.add_step(step_2)
  25. #Step 3: Column Parser
  26. step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.column_parser'))
  27. step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
  28. step_3.add_output('produce')
  29. step_3.add_hyperparameter(name='parse_semantic_types', argument_type=ArgumentType.VALUE,
  30. data=['http://schema.org/Boolean','http://schema.org/Integer', 'http://schema.org/Float', 'https://metadata.datadrivendiscovery.org/types/FloatVector',])
  31. pipeline_description.add_step(step_3)
  32. # Step 4: extract_columns_by_semantic_types(attributes)
  33. step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
  34. step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
  35. step_4.add_output('produce')
  36. step_4.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
  37. data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
  38. pipeline_description.add_step(step_4)
  39. # Step 5: extract_columns_by_semantic_types(targets)
  40. step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
  41. step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
  42. step_5.add_output('produce')
  43. step_5.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
  44. data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
  45. pipeline_description.add_step(step_5)
  46. attributes = 'steps.4.produce'
  47. targets = 'steps.5.produce'
  48. # Step 6: processing
  49. step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.statistical_maximum'))
  50. step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
  51. step_6.add_output('produce')
  52. pipeline_description.add_step(step_6)
  53. # Step 7: algorithm
  54. #step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ae'))
  55. step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.pyod_ocsvm'))
  56. step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
  57. step_7.add_output('produce_score')
  58. pipeline_description.add_step(step_7)
  59. # Step 8: Predictions
  60. #step_8 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions'))
  61. step_8 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.system_wise_detection'))
  62. step_8.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce_score')
  63. #step_8.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
  64. step_8.add_output('produce')
  65. pipeline_description.add_step(step_8)
  66. step_9 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions'))
  67. step_9.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.8.produce')
  68. step_9.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
  69. step_9.add_output('produce')
  70. pipeline_description.add_step(step_9)
  71. # Final Output
  72. pipeline_description.add_output(name='output predictions', data_reference='steps.9.produce')
  73. # Output to json
  74. data = pipeline_description.to_json()
  75. with open('system_pipeline.json', 'w') as f:
  76. f.write(data)
  77. print(data)

全栈的自动化机器学习系统,主要针对多变量时间序列数据的异常检测。TODS提供了详尽的用于构建基于机器学习的异常检测系统的模块,它们包括:数据处理(data processing),时间序列处理( time series processing),特征分析(feature analysis),检测算法(detection algorithms),和强化模块( reinforcement module)。这些模块所提供的功能包括常见的数据预处理、时间序列数据的平滑或变换,从时域或频域中抽取特征、多种多样的检测算