You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

pipeline_construction_subseq.py 10 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183
  1. import uuid
  2. import random
  3. from d3m.metadata.pipeline import Pipeline
  4. from axolotl.algorithms.base import PipelineSearchBase
  5. from axolotl.utils import schemas as schemas_utils
  6. primitive_python_paths = { # pragma: no cover
  7. 'data_processing': [
  8. #'d3m.primitives.tods.data_processing.time_interval_transform',
  9. #'d3m.primitives.tods.data_processing.categorical_to_binary',
  10. #'d3m.primitives.tods.data_processing.column_filter',
  11. #'d3m.primitives.tods.data_processing.timestamp_validation',
  12. #'d3m.primitives.tods.data_processing.duplication_validation',
  13. #'d3m.primitives.tods.data_processing.continuity_validation',
  14. ],
  15. 'timeseries_processing': [
  16. #'d3m.primitives.tods.timeseries_processing.transformation.axiswise_scaler',
  17. 'd3m.primitives.tods.timeseries_processing.subsequence_segmentation',
  18. #'d3m.primitives.tods.timeseries_processing.transformation.standard_scaler',
  19. #'d3m.primitives.tods.timeseries_processing.transformation.power_transformer',
  20. #'d3m.primitives.tods.timeseries_processing.transformation.quantile_transformer',
  21. #'d3m.primitives.tods.timeseries_processing.transformation.moving_average_transform',
  22. #'d3m.primitives.tods.timeseries_processing.transformation.simple_exponential_smoothing',
  23. #'d3m.primitives.tods.timeseries_processing.transformation.holt_smoothing',
  24. #'d3m.primitives.tods.timeseries_processing.transformation.holt_winters_exponential_smoothing',
  25. #'d3m.primitives.tods.timeseries_processing.decomposition.time_series_seasonality_trend_decomposition',
  26. ],
  27. 'feature_analysis': [
  28. #'d3m.primitives.tods.feature_analysis.auto_correlation',
  29. #'d3m.primitives.tods.feature_analysis.statistical_mean',
  30. #'d3m.primitives.tods.feature_analysis.statistical_median',
  31. #'d3m.primitives.tods.feature_analysis.statistical_g_mean',
  32. #'d3m.primitives.tods.feature_analysis.statistical_abs_energy',
  33. #'d3m.primitives.tods.feature_analysis.statistical_abs_sum',
  34. #'d3m.primitives.tods.feature_analysis.statistical_h_mean',
  35. #'d3m.primitives.tods.feature_analysis.statistical_maximum',
  36. #'d3m.primitives.tods.feature_analysis.statistical_minimum',
  37. #'d3m.primitives.tods.feature_analysis.statistical_mean_abs',
  38. #'d3m.primitives.tods.feature_analysis.statistical_mean_abs_temporal_derivative',
  39. #'d3m.primitives.tods.feature_analysis.statistical_mean_temporal_derivative',
  40. #'d3m.primitives.tods.feature_analysis.statistical_median_abs_deviation',
  41. #'d3m.primitives.tods.feature_analysis.statistical_kurtosis',
  42. #'d3m.primitives.tods.feature_analysis.statistical_skew',
  43. #'d3m.primitives.tods.feature_analysis.statistical_std',
  44. #'d3m.primitives.tods.feature_analysis.statistical_var',
  45. #'d3m.primitives.tods.feature_analysis.statistical_variation',
  46. #'d3m.primitives.tods.feature_analysis.statistical_vec_sum',
  47. #'d3m.primitives.tods.feature_analysis.statistical_willison_amplitude',
  48. #'d3m.primitives.tods.feature_analysis.statistical_zero_crossing',
  49. #'d3m.primitives.tods.feature_analysis.spectral_residual_transform',
  50. #'d3m.primitives.tods.feature_analysis.fast_fourier_transform',
  51. #'d3m.primitives.tods.feature_analysis.discrete_cosine_transform',
  52. #'d3m.primitives.tods.feature_analysis.non_negative_matrix_factorization',
  53. #'d3m.primitives.tods.feature_analysis.bk_filter',
  54. #'d3m.primitives.tods.feature_analysis.hp_filter',
  55. #'d3m.primitives.tods.feature_analysis.truncated_svd',
  56. #'d3m.primitives.tods.feature_analysis.wavelet_transform',
  57. #'d3m.primitives.tods.feature_analysis.trmf',
  58. ],
  59. 'detection_algorithm': [
  60. #'d3m.primitives.tods.detection_algorithm.pyod_ae',
  61. #'d3m.primitives.tods.detection_algorithm.pyod_vae',
  62. #'d3m.primitives.tods.detection_algorithm.pyod_cof',
  63. #'d3m.primitives.tods.detection_algorithm.pyod_sod',
  64. #'d3m.primitives.tods.detection_algorithm.pyod_abod',
  65. #'d3m.primitives.tods.detection_algorithm.pyod_hbos',
  66. 'd3m.primitives.tods.detection_algorithm.pyod_iforest',
  67. #'d3m.primitives.tods.detection_algorithm.pyod_lof',
  68. #'d3m.primitives.tods.detection_algorithm.pyod_knn',
  69. 'd3m.primitives.tods.detection_algorithm.pyod_ocsvm',
  70. #'d3m.primitives.tods.detection_algorithm.pyod_loda',
  71. #'d3m.primitives.tods.detection_algorithm.pyod_cblof',
  72. 'd3m.primitives.tods.detection_algorithm.pyod_sogaal',
  73. #'d3m.primitives.tods.detection_algorithm.pyod_mogaal',
  74. #'d3m.primitives.tods.detection_algorithm.matrix_profile',
  75. #'d3m.primitives.tods.detection_algorithm.AutoRegODetector',
  76. #'d3m.primitives.tods.detection_algorithm.LSTMODetector',
  77. #'d3m.primitives.tods.detection_algorithm.AutoRegODetector',
  78. #'d3m.primitives.tods.detection_algorithm.PCAODetector',
  79. #'d3m.primitives.tods.detection_algorithm.KDiscordODetector',
  80. #'d3m.primitives.tods.detection_algorithm.deeplog',
  81. #'d3m.primitives.tods.detection_algorithm.telemanom',
  82. ],
  83. 'contamination': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
  84. }
  85. def _generate_pipeline(combinations):
  86. from d3m import index
  87. from d3m.metadata.base import ArgumentType
  88. from d3m.metadata.pipeline import Pipeline, PrimitiveStep
  89. piplines = []
  90. for combination in combinations:
  91. # Creating pipeline
  92. pipeline_description = Pipeline()
  93. pipeline_description.add_input(name='inputs')
  94. # The first three steps are fixed
  95. # Step 0: dataset_to_dataframe
  96. step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe'))
  97. step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
  98. step_0.add_output('produce')
  99. pipeline_description.add_step(step_0)
  100. # Step 1: column_parser
  101. step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.column_parser'))
  102. step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
  103. step_1.add_output('produce')
  104. pipeline_description.add_step(step_1)
  105. # Step 2: extract_columns_by_semantic_types(attributes)
  106. step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
  107. step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
  108. step_2.add_output('produce')
  109. step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
  110. data=['https://metadata.datadrivendiscovery.org/types/Attribute'])
  111. pipeline_description.add_step(step_2)
  112. # Step 3: extract_columns_by_semantic_types(targets)
  113. step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types'))
  114. step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
  115. step_3.add_output('produce')
  116. step_3.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE,
  117. data=['https://metadata.datadrivendiscovery.org/types/TrueTarget'])
  118. pipeline_description.add_step(step_3)
  119. attributes = 'steps.2.produce'
  120. targets = 'steps.3.produce'
  121. tods_step_4 = PrimitiveStep(primitive=index.get_primitive(combination[0]))
  122. tods_step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=attributes)
  123. tods_step_4.add_hyperparameter(name='window_size', argument_type=ArgumentType.VALUE, data=10)
  124. tods_step_4.add_hyperparameter(name='step', argument_type=ArgumentType.VALUE, data=1)
  125. tods_step_4.add_output('produce')
  126. pipeline_description.add_step(tods_step_4)
  127. tods_step_5= PrimitiveStep(primitive=index.get_primitive(combination[1]))
  128. tods_step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
  129. tods_step_5.add_output('produce')
  130. tods_step_5.add_hyperparameter(name='contamination', argument_type=ArgumentType.VALUE, data=combination[2])
  131. pipeline_description.add_step(tods_step_5)
  132. # Finalize the pipeline
  133. final_step = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions'))
  134. final_step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
  135. final_step.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
  136. final_step.add_output('produce')
  137. pipeline_description.add_step(final_step)
  138. pipeline_description.add_output(name='output predictions', data_reference='steps.6.produce')
  139. data = pipeline_description.to_json()
  140. #with open('../pipelines/'+str(combination[1].split(".")[-1])+'_'+str(combination[2])+".json", 'w') as f:
  141. with open('./pipelines/subseq/'+str(combination[1].split(".")[-1])+'_subseq_'+str(combination[2])+".json", 'w') as f:
  142. f.write(data)
  143. pipeline_description.id = str(uuid.uuid4())
  144. pipeline_description.created = Pipeline().created
  145. piplines.append(pipeline_description)
  146. return piplines
  147. def _generate_pipelines(primitive_python_paths, cpu_count=40): # pragma: no cover
  148. """
  149. Args:
  150. primitive_python_paths: a list of primitive Python paths for algorithms
  151. Returns:
  152. the pipline description json
  153. """
  154. import itertools
  155. import multiprocessing as mp
  156. #components = ['data_processing', 'timeseries_processing', 'feature_analysis', 'detection_algorithm']
  157. components = ['timeseries_processing', 'detection_algorithm', 'contamination']
  158. combinations = itertools.product(*(primitive_python_paths[k] for k in components))
  159. return _generate_pipeline(combinations)
  160. if __name__ == "__main__":
  161. combinations = _generate_pipelines(primitive_python_paths)
  162. print(combinations)

全栈的自动化机器学习系统,主要针对多变量时间序列数据的异常检测。TODS提供了详尽的用于构建基于机器学习的异常检测系统的模块,它们包括:数据处理(data processing),时间序列处理( time series processing),特征分析(feature analysis),检测算法(detection algorithms),和强化模块( reinforcement module)。这些模块所提供的功能包括常见的数据预处理、时间序列数据的平滑或变换,从时域或频域中抽取特征、多种多样的检测算