
merge common

Former-commit-id: 082a7acf57 [formerly 1f74e655a7] [formerly 69763f4c12 [formerly a893d7685f]] [formerly cf3b68a678 [formerly 99d8f9b778] [formerly 18f539f981 [formerly ef1836f33f]]] [formerly bd8884495d [formerly 7fbf002dac] [formerly f5a45b390b [formerly 5ff75e2e46]] [formerly 1b40e48dd8 [formerly f323cbe994] [formerly 84c2b485e4 [formerly da5752974d]]]] [formerly 79eafae538 [formerly c68cb0cba1] [formerly c98bc71cb5 [formerly f3f012f5fb]] [formerly f202a31ab4 [formerly 4694d83a52] [formerly a1f0aaca82 [formerly eb090d6d3a]]] [formerly 37c4555bd0 [formerly 55dfcc63fb] [formerly 302c56c509 [formerly bebdfd98dd]] [formerly eed97e12bd [formerly 67f3d10947] [formerly 395df67343 [formerly c43cf776c6]]]]] [formerly c4997cb6a0 [formerly c4703c17ec] [formerly 7529580984 [formerly 9c000b2c79]] [formerly b102c6c9b5 [formerly fd861756c5] [formerly 537f077ec0 [formerly ee573b901f]]] [formerly 105d925a3d [formerly 8cdc22f1d9] [formerly 39e5bc7d5d [formerly 5edf837521]] [formerly 21b0344ffc [formerly 41e32435f7] [formerly cf73b9408f [formerly 04b883bb8e]]]] [formerly e732011aec [formerly 163307ea6f] [formerly 38c9e8ff5e [formerly 2c63f40878]] [formerly dc9037bcb6 [formerly 9d15047b5b] [formerly 4da5904375 [formerly 577283a827]]] [formerly 238094954c [formerly 99f59e110c] [formerly a7bba7d99c [formerly f7b13e25e8]] [formerly 411b1ea01b [formerly baabbad21a] [formerly 869366cb7a [formerly 733cfb4398]]]]]]
Former-commit-id: 5ac3e41be2 [formerly fe85759519] [formerly bc7c03db32 [formerly 9bb2fa5132]] [formerly 3b49901893 [formerly 5dd1b25ebc] [formerly b7a93df358 [formerly 7b3f6e3090]]] [formerly 627cceb26b [formerly b0f3ed5f08] [formerly 1672ff9df1 [formerly 8eb215a652]] [formerly c9a8fc553b [formerly 7a69e6e65c] [formerly 5418882e2d [formerly 19352e6507]]]] [formerly ce5bc94f1c [formerly 65c63ca77b] [formerly 00098b391f [formerly 4e4b706c5f]] [formerly 16fbe6ff8e [formerly ad40273329] [formerly f113cd10e6 [formerly 5b3aa4a777]]] [formerly 7af46bef20 [formerly 0fe5b7dbb2] [formerly 712f4701ed [formerly 273adfd4eb]] [formerly 226de99943 [formerly 626f787629] [formerly 869366cb7a]]]]
Former-commit-id: 912fde5a07 [formerly 6c947b866f] [formerly 799d5800c2 [formerly ef77c736ef]] [formerly 811bc3d01f [formerly 341febd7c4] [formerly 13c141596a [formerly 933e5935f1]]] [formerly b77ccf87c1 [formerly f8fc926b80] [formerly 0632053739 [formerly 73abb0cf6a]] [formerly e14b39bc6e [formerly ff77272b0d] [formerly e9bbc7aff8 [formerly b6be52bd79]]]]
Former-commit-id: 98f7b4cc65 [formerly 3f70f5e2aa] [formerly f1df66b510 [formerly 36f63495a4]] [formerly 1b39a6508e [formerly db3e50f7dd] [formerly a36fc0aa02 [formerly e4e3039ce7]]]
Former-commit-id: 05ff21ba2f [formerly b98dd6488a] [formerly c4dbfe74c2 [formerly 7451b535e9]]
Former-commit-id: 51c8386465 [formerly 3b85eedffd]
Former-commit-id: 90d4dc1597
master
lhenry15 4 years ago
parent
commit
4069de2744
100 changed files with 0 additions and 18875 deletions
  1. +0 -363  common-primitives/HISTORY.md
  2. +0 -94  common-primitives/HOW_TO_MANAGE.md
  3. +0 -201  common-primitives/LICENSE.txt
  4. +0 -2  common-primitives/MANIFEST.in
  5. +0 -83  common-primitives/README.md
  6. +0 -24  common-primitives/add.sh
  7. +0 -63  common-primitives/entry_points.ini
  8. +0 -5  common-primitives/git-add.sh
  9. +0 -21  common-primitives/git-check.sh
  10. +0 -32  common-primitives/list_primitives.py
  11. BIN  common-primitives/pipeline_runs/classification.light_gbm.DataFrameCommon/1.yaml.gz
  12. BIN  common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/1.yaml.gz
  13. +0 -1  common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz
  14. BIN  common-primitives/pipeline_runs/classification.xgboost_dart.DataFrameCommon/1.yaml.gz
  15. BIN  common-primitives/pipeline_runs/classification.xgboost_gbtree.DataFrameCommon/1.yaml.gz
  16. BIN  common-primitives/pipeline_runs/data_augmentation.datamart_augmentation.Common/2.yaml.gz
  17. BIN  common-primitives/pipeline_runs/data_preprocessing.dataset_sample.Common/1.yaml.gz
  18. +0 -1  common-primitives/pipeline_runs/data_preprocessing.one_hot_encoder.PandasCommon/pipeline_run_extract_structural_types.yml.gz
  19. +0 -1  common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/1.yaml.gz
  20. +0 -1  common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz
  21. +0 -1  common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_group_field_compose.yml.gz
  22. +0 -1  common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/1.yaml.gz
  23. +0 -1  common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz
  24. +0 -1  common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/1.yaml.gz
  25. +0 -1  common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_extract_structural_types.yml.gz
  26. +0 -1  common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_group_field_compose.yml.gz
  27. +0 -1  common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/1.yaml.gz
  28. +0 -1  common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz
  29. BIN  common-primitives/pipeline_runs/data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz
  30. BIN  common-primitives/pipeline_runs/data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz
  31. +0 -1  common-primitives/pipeline_runs/data_transformation.horizontal_concat.DataFrameConcat/1.yaml.gz
  32. +0 -1  common-primitives/pipeline_runs/data_transformation.remove_columns.Common/pipeline_run_extract_structural_types.yml.gz
  33. +0 -4729  common-primitives/pipeline_runs/regression.xgboost_gbtree.DataFrameCommon/1.yml
  34. +0 -1  common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_extract_structural_types.yml.gz
  35. +0 -1  common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_group_field_compose.yml.gz
  36. +0 -246  common-primitives/pipelines/classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json
  37. +0 -1  common-primitives/pipelines/classification.random_forest.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json
  38. +0 -110  common-primitives/pipelines/classification.random_forest.DataFrameCommon/ccad0f9c-130e-4063-a91e-ea65a18cb041.yaml
  39. +0 -246  common-primitives/pipelines/classification.xgboost_dart.DataFrameCommon/b7a24816-2518-4073-9c45-b97f2b2fee30.json
  40. +0 -246  common-primitives/pipelines/classification.xgboost_gbtree.DataFrameCommon/4d402450-2562-48cc-93fd-719fb658c43c.json
  41. +0 -522  common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5.json
  42. +0 -342  common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json
  43. +0 -123  common-primitives/pipelines/data_preprocessing.dataset_sample.Common/387d432a-9893-4558-b190-1c5e9e399dbf.yaml
  44. +0 -300  common-primitives/pipelines/data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json
  45. +0 -1  common-primitives/pipelines/data_preprocessing.one_hot_encoder.PandasCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json
  46. +0 -1  common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json
  47. +0 -1  common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/a8c40699-c48d-4f12-aa18-639c5fb6baae.json
  48. +0 -1  common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json
  49. +0 -1  common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json
  50. +0 -1  common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json
  51. +0 -1  common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json
  52. +0 -1  common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json
  53. +0 -1  common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json
  54. +0 -1  common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json
  55. +0 -1  common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json
  56. +0 -1  common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json
  57. +0 -1  common-primitives/pipelines/data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json
  58. +0 -71  common-primitives/pipelines/data_transformation.extract_columns.Common/pipeline.py
  59. +0 -1  common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json
  60. +0 -1  common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json
  61. +0 -1  common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json
  62. +0 -83  common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/pipeline.py
  63. +0 -1  common-primitives/pipelines/data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json
  64. +0 -100  common-primitives/pipelines/data_transformation.grouping_field_compose.Common/pipeline.py
  65. +0 -1  common-primitives/pipelines/data_transformation.horizontal_concat.DataFrameConcat/2b307634-f01e-412e-8d95-7e54afd4731f.json
  66. +0 -1  common-primitives/pipelines/data_transformation.remove_columns.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json
  67. +0 -272  common-primitives/pipelines/data_transformation.rename_duplicate_name.DataFrameCommon/11ee9290-992d-4e48-97ed-1a6e4c15f92f.json
  68. +0 -83  common-primitives/pipelines/evaluation.kfold_timeseries_split.Common/k-fold-timeseries-split.yml
  69. +0 -108  common-primitives/pipelines/operator.dataset_map.DataFrameCommon/k-fold-timeseries-split-raw.yml
  70. +0 -247  common-primitives/pipelines/regression.xgboost_gbtree.DataFrameCommon/0f636602-6299-411b-9873-4b974cd393ba.json
  71. +0 -1  common-primitives/pipelines/schema_discovery.profiler.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json
  72. +0 -1  common-primitives/pipelines/schema_discovery.profiler.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json
  73. +0 -1  common-primitives/pipelines/schema_discovery.profiler.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json
  74. +0 -44  common-primitives/run_pipelines.sh
  75. +0 -11  common-primitives/run_tests.py
  76. +0 -28  common-primitives/setup.cfg
  77. +0 -65  common-primitives/setup.py
  78. +0 -2  common-primitives/sklearn-wrap/.gitignore
  79. +0 -31  common-primitives/sklearn-wrap/requirements.txt
  80. +0 -106  common-primitives/sklearn-wrap/setup.py
  81. +0 -470  common-primitives/sklearn-wrap/sklearn_wrap/SKARDRegression.py
  82. +0 -498  common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostClassifier.py
  83. +0 -437  common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostRegressor.py
  84. +0 -589  common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingClassifier.py
  85. +0 -533  common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingRegressor.py
  86. +0 -508  common-primitives/sklearn-wrap/sklearn_wrap/SKBernoulliNB.py
  87. +0 -330  common-primitives/sklearn-wrap/sklearn_wrap/SKBinarizer.py
  88. +0 -490  common-primitives/sklearn-wrap/sklearn_wrap/SKCountVectorizer.py
  89. +0 -621  common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeClassifier.py
  90. +0 -565  common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeRegressor.py
  91. +0 -503  common-primitives/sklearn-wrap/sklearn_wrap/SKDummyClassifier.py
  92. +0 -442  common-primitives/sklearn-wrap/sklearn_wrap/SKDummyRegressor.py
  93. +0 -466  common-primitives/sklearn-wrap/sklearn_wrap/SKElasticNet.py
  94. +0 -675  common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesClassifier.py
  95. +0 -607  common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesRegressor.py
  96. +0 -439  common-primitives/sklearn-wrap/sklearn_wrap/SKFastICA.py
  97. +0 -361  common-primitives/sklearn-wrap/sklearn_wrap/SKFeatureAgglomeration.py
  98. +0 -492  common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianNB.py
  99. +0 -463  common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianProcessRegressor.py
  100. +0 -344  common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianRandomProjection.py

+0 -363  common-primitives/HISTORY.md

@@ -1,363 +0,0 @@
## v0.8.0

* Removed multi-target support in `classification.light_gbm.Common` and fixed
handling of categorical attributes.
[!118](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/118)
* Unified date parsing across primitives.
Added `raise_error` hyper-parameter to `data_preprocessing.datetime_range_filter.Common`.
This bumped the version of the primitive.
[!117](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/117)
* `evaluation.kfold_time_series_split.Common` now parses the datetime column
before sorting. `fuzzy_time_parsing` hyper-parameter was added to the primitive.
This bumped the version of the primitive.
[!110](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/110)
* Added option `equal` to hyper-parameter `match_logic` of primitive
`data_transformation.extract_columns_by_semantic_types.Common` to support set equality
when determining columns to extract. This bumped the version of the primitive.
[!116](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/116)
* Fixed `data_preprocessing.one_hot_encoder.MakerCommon` to work with the
latest core package.
* `data_cleaning.tabular_extractor.Common` has been fixed to work with the
latest version of sklearn.
[!113](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/113)
* ISI side of `data_augmentation.datamart_augmentation.Common` and
`data_augmentation.datamart_download.Common` has been updated.
[!108](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/108)
* Improved how pipelines and pipeline runs for all primitives are managed.
Many more pipelines and pipeline runs were added.
* `evaluation.kfold_timeseries_split.Common` has been renamed to `evaluation.kfold_time_series_split.Common`.
* Fixed `data_preprocessing.dataset_sample.Common` on empty input.
[!95](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/95)
* `data_preprocessing.datetime_range_filter.Common` no longer assumes the local timezone
when parsing dates.
[#115](https://gitlab.com/datadrivendiscovery/common-primitives/issues/115)
* Added `fuzzy_time_parsing` hyper-parameter to `data_transformation.column_parser.Common`.
This bumped the version of the primitive.
* Fixed `data_transformation.column_parser.Common` to work correctly with `python-dateutil==2.8.1`.
[#119](https://gitlab.com/datadrivendiscovery/common-primitives/issues/119).
* Refactored `data_preprocessing.one_hot_encoder.MakerCommon` to address some issues.
[#66](https://gitlab.com/datadrivendiscovery/common-primitives/issues/66)
[#75](https://gitlab.com/datadrivendiscovery/common-primitives/issues/75)
[!96](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/96)
* Added support for handling of numeric columns to `data_preprocessing.regex_filter.Common` and `data_preprocessing.term_filter.Common`.
[!101](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/101)
[!104](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/104)
* Fixed an exception in the `produce` method of `data_transformation.datetime_field_compose.Common` caused by using an incorrect type for the dataframe indexer.
[!102](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/102)
* Added primitives:
* `data_transformation.grouping_field_compose.Common`

## v0.7.0

* Renamed primitives:
* `data_transformation.add_semantic_types.DataFrameCommon` to `data_transformation.add_semantic_types.Common`
* `data_transformation.remove_semantic_types.DataFrameCommon` to `data_transformation.remove_semantic_types.Common`
* `data_transformation.replace_semantic_types.DataFrameCommon` to `data_transformation.replace_semantic_types.Common`
* `operator.column_map.DataFrameCommon` to `operator.column_map.Common`
* `regression.xgboost_gbtree.DataFrameCommon` to `regression.xgboost_gbtree.Common`
* `classification.light_gbm.DataFrameCommon` to `classification.light_gbm.Common`
* `classification.xgboost_gbtree.DataFrameCommon` to `classification.xgboost_gbtree.Common`
* `classification.xgboost_dart.DataFrameCommon` to `classification.xgboost_dart.Common`
* `classification.random_forest.DataFrameCommon` to `classification.random_forest.Common`
* `data_transformation.extract_columns.DataFrameCommon` to `data_transformation.extract_columns.Common`
* `data_transformation.extract_columns_by_semantic_types.DataFrameCommon` to `data_transformation.extract_columns_by_semantic_types.Common`
* `data_transformation.extract_columns_by_structural_types.DataFrameCommon` to `data_transformation.extract_columns_by_structural_types.Common`
* `data_transformation.cut_audio.DataFrameCommon` to `data_transformation.cut_audio.Common`
* `data_transformation.column_parser.DataFrameCommon` to `data_transformation.column_parser.Common`
* `data_transformation.remove_columns.DataFrameCommon` to `data_transformation.remove_columns.Common`
* `data_transformation.remove_duplicate_columns.DataFrameCommon` to `data_transformation.remove_duplicate_columns.Common`
* `data_transformation.horizontal_concat.DataFrameConcat` to `data_transformation.horizontal_concat.DataFrameCommon`
* `data_transformation.construct_predictions.DataFrameCommon` to `data_transformation.construct_predictions.Common`
* `data_transformation.datetime_field_compose.DataFrameCommon` to `data_transformation.datetime_field_compose.Common`
* `data_preprocessing.label_encoder.DataFrameCommon` to `data_preprocessing.label_encoder.Common`
* `data_preprocessing.label_decoder.DataFrameCommon` to `data_preprocessing.label_decoder.Common`
* `data_preprocessing.image_reader.DataFrameCommon` to `data_preprocessing.image_reader.Common`
* `data_preprocessing.text_reader.DataFrameCommon` to `data_preprocessing.text_reader.Common`
* `data_preprocessing.video_reader.DataFrameCommon` to `data_preprocessing.video_reader.Common`
* `data_preprocessing.csv_reader.DataFrameCommon` to `data_preprocessing.csv_reader.Common`
* `data_preprocessing.audio_reader.DataFrameCommon` to `data_preprocessing.audio_reader.Common`
* `data_preprocessing.regex_filter.DataFrameCommon` to `data_preprocessing.regex_filter.Common`
* `data_preprocessing.term_filter.DataFrameCommon` to `data_preprocessing.term_filter.Common`
* `data_preprocessing.numeric_range_filter.DataFrameCommon` to `data_preprocessing.numeric_range_filter.Common`
* `data_preprocessing.datetime_range_filter.DataFrameCommon` to `data_preprocessing.datetime_range_filter.Common`

## v0.6.0

* Added `match_logic`, `negate`, and `add_index_columns` hyper-parameters
to `data_transformation.extract_columns_by_structural_types.DataFrameCommon`
and `data_transformation.extract_columns_by_semantic_types.DataFrameCommon`
primitives.
* `feature_extraction.sparse_pca.Common` has been removed and is now available as part of realML.
[!89](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/89)
* Added new primitives:
* `data_preprocessing.datetime_range_filter.DataFrameCommon`
* `data_transformation.datetime_field_compose.DataFrameCommon`
* `d3m.primitives.data_preprocessing.flatten.DataFrameCommon`
* `data_augmentation.datamart_augmentation.Common`
* `data_augmentation.datamart_download.Common`
* `data_preprocessing.dataset_sample.Common`

[#53](https://gitlab.com/datadrivendiscovery/common-primitives/issues/53)
[!86](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/86)
[!87](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/87)
[!85](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/85)
[!63](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/63)
[!92](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/92)
[!93](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/93)
[!81](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/81)

* Fixed the `fit` method to return the correct value for `operator.column_map.DataFrameCommon`,
`operator.dataset_map.DataFrameCommon`, and `schema_discovery.profiler.Common`.
* Some unmaintained primitives have been disabled. If you are using them, consider adopting them.
* `classification.bayesian_logistic_regression.Common`
* `regression.convolutional_neural_net.TorchCommon`
* `operator.diagonal_mvn.Common`
* `regression.feed_forward_neural_net.TorchCommon`
* `data_preprocessing.image_reader.Common`
* `clustering.k_means.Common`
* `regression.linear_regression.Common`
* `regression.loss.TorchCommon`
* `feature_extraction.pca.Common`
* `data_transformation.update_semantic_types.DatasetCommon` has been removed.
Use `data_transformation.add_semantic_types.DataFrameCommon`,
`data_transformation.remove_semantic_types.DataFrameCommon`,
or `data_transformation.replace_semantic_types.DataFrameCommon` together with
`operator.dataset_map.DataFrameCommon` primitive to obtain previous functionality.
[#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
* `data_transformation.remove_columns.DatasetCommon` has been removed.
Use `data_transformation.remove_columns.DataFrameCommon` together with
`operator.dataset_map.DataFrameCommon` primitive to obtain previous functionality.
[#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
* Some primitives which operate on Dataset have been converted to operate
on DataFrame and renamed. Use them together with `operator.dataset_map.DataFrameCommon`
primitive to obtain previous functionality.
* `data_preprocessing.regex_filter.DatasetCommon` to `data_preprocessing.regex_filter.DataFrameCommon`
* `data_preprocessing.term_filter.DatasetCommon` to `data_preprocessing.term_filter.DataFrameCommon`
* `data_preprocessing.numeric_range_filter.DatasetCommon` to `data_preprocessing.numeric_range_filter.DataFrameCommon`

[#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
[!84](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/84)

* `schema_discovery.profiler.Common` has been improved:
* More options added to `detect_semantic_types`.
* Added new `remove_unknown_type` hyper-parameter.

## v0.5.0

* `evaluation.compute_scores.Common` primitive has been moved to the core
package and renamed to `evaluation.compute_scores.Core`.
* `metafeature_extraction.compute_metafeatures.Common` has been renamed to
`metalearning.metafeature_extractor.Common`
* `evaluation.compute_scores.Common` now has an `add_normalized_scores` hyper-parameter
which controls whether a column with normalized scores is also added to the output;
it is added by default.
* `data_preprocessing.text_reader.DataFrameCommon` primitive has been fixed.
* `data_transformation.rename_duplicate_name.DataFrameCommon` primitive was
fixed to handle all types of column names.
[#73](https://gitlab.com/datadrivendiscovery/common-primitives/issues/73)
[!65](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/65)
* Added new primitives:
* `data_cleaning.tabular_extractor.Common`
* `data_preprocessing.one_hot_encoder.PandasCommon`
* `schema_discovery.profiler.Common`
* `data_transformation.ravel.DataFrameRowCommon`
* `operator.column_map.DataFrameCommon`
* `operator.dataset_map.DataFrameCommon`
* `data_transformation.normalize_column_references.Common`
* `data_transformation.normalize_graphs.Common`
* `feature_extraction.sparse_pca.Common`
* `evaluation.kfold_timeseries_split.Common`

[#57](https://gitlab.com/datadrivendiscovery/common-primitives/issues/57)
[!42](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/42)
[!44](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/44)
[!47](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/47)
[!71](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/71)
[!73](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/73)
[!77](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/77)
[!66](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/66)
[!67](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/67)

* Added hyper-parameter `error_on_no_columns` to `classification.random_forest.DataFrameCommon`.
* Common primitives have been updated to the latest changes in the d3m core package.
* Many utility functions from `utils.py` have been moved to the d3m core package.

## v0.4.0

* Renamed `data_preprocessing.one_hot_encoder.Common` to
`data_preprocessing.one_hot_encoder.MakerCommon` and reimplemented it.
[!54](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/54)
* Added new primitives:
* `classification.xgboost_gbtree.DataFrameCommon`
* `classification.xgboost_dart.DataFrameCommon`
* `regression.xgboost_gbtree.DataFrameCommon`
* `classification.light_gbm.DataFrameCommon`
* `data_transformation.rename_duplicate_name.DataFrameCommon`

[!45](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/45)
[!46](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/46)
[!49](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/49)

* Made sure `utils.select_columns` also works when given a tuple of columns instead of a list.
[!58](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/58)
* `classification.random_forest.DataFrameCommon` updated so that produced columns have
names matching column names during fitting. Moreover, `produce_feature_importances`
now returns a `DataFrame` with one column per feature and a single row of importances.
[!59](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/59)
* `regression.feed_forward_neural_net.TorchCommon` updated to support
selection of columns using semantic types.
[!57](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/57)

## v0.3.0

* Made `evaluation.redact_columns.Common` primitive more general so that it can
redact any columns based on their semantic type and not just targets.
* Renamed primitives:
* `datasets.Denormalize` to `data_transformation.denormalize.Common`
* `datasets.DatasetToDataFrame` to `data_transformation.dataset_to_dataframe.Common`
* `evaluation.ComputeScores` to `evaluation.compute_scores.Common`
* `evaluation.RedactTargets` to `evaluation.redact_columns.Common`
* `evaluation.KFoldDatasetSplit` to `evaluation.kfold_dataset_split.Common`
* `evaluation.TrainScoreDatasetSplit` to `evaluation.train_score_dataset_split.Common`
* `evaluation.NoSplitDatasetSplit` to `evaluation.no_split_dataset_split.Common`
* `evaluation.FixedSplitDatasetSplit` to `evaluation.fixed_split_dataset_split.Common`
* `classifier.RandomForest` to `classification.random_forest.DataFrameCommon`
* `metadata.ComputeMetafeatures` to `metafeature_extraction.compute_metafeatures.Common`
* `audio.CutAudio` to `data_transformation.cut_audio.DataFrameCommon`
* `data.ListToNDArray` to `data_transformation.list_to_ndarray.Common`
* `data.StackNDArrayColumn` to `data_transformation.stack_ndarray_column.Common`
* `data.AddSemanticTypes` to `data_transformation.add_semantic_types.DataFrameCommon`
* `data.RemoveSemanticTypes` to `data_transformation.remove_semantic_types.DataFrameCommon`
* `data.ConstructPredictions` to `data_transformation.construct_predictions.DataFrameCommon`
* `data.ColumnParser` to `data_transformation.column_parser.DataFrameCommon`
* `data.CastToType` to `data_transformation.cast_to_type.Common`
* `data.ExtractColumns` to `data_transformation.extract_columns.DataFrameCommon`
* `data.ExtractColumnsBySemanticTypes` to `data_transformation.extract_columns_by_semantic_types.DataFrameCommon`
* `data.ExtractColumnsByStructuralTypes` to `data_transformation.extract_columns_by_structural_types.DataFrameCommon`
* `data.RemoveColumns` to `data_transformation.remove_columns.DataFrameCommon`
* `data.RemoveDuplicateColumns` to `data_transformation.remove_duplicate_columns.DataFrameCommon`
* `data.HorizontalConcat` to `data_transformation.horizontal_concat.DataFrameConcat`
* `data.DataFrameToNDArray` to `data_transformation.dataframe_to_ndarray.Common`
* `data.NDArrayToDataFrame` to `data_transformation.ndarray_to_dataframe.Common`
* `data.DataFrameToList` to `data_transformation.dataframe_to_list.Common`
* `data.ListToDataFrame` to `data_transformation.list_to_dataframe.Common`
* `data.NDArrayToList` to `data_transformation.ndarray_to_list.Common`
* `data.ReplaceSemanticTypes` to `data_transformation.replace_semantic_types.DataFrameCommon`
* `data.UnseenLabelEncoder` to `data_preprocessing.label_encoder.DataFrameCommon`
* `data.UnseenLabelDecoder` to `data_preprocessing.label_decoder.DataFrameCommon`
* `data.ImageReader` to `data_preprocessing.image_reader.DataFrameCommon`
* `data.TextReader` to `data_preprocessing.text_reader.DataFrameCommon`
* `data.VideoReader` to `data_preprocessing.video_reader.DataFrameCommon`
* `data.CSVReader` to `data_preprocessing.csv_reader.DataFrameCommon`
* `data.AudioReader` to `data_preprocessing.audio_reader.DataFrameCommon`
* `datasets.UpdateSemanticTypes` to `data_transformation.update_semantic_types.DatasetCommon`
* `datasets.RemoveColumns` to `data_transformation.remove_columns.DatasetCommon`
* `datasets.RegexFilter` to `data_preprocessing.regex_filter.DatasetCommon`
* `datasets.TermFilter` to `data_preprocessing.term_filter.DatasetCommon`
* `datasets.NumericRangeFilter` to `data_preprocessing.numeric_range_filter.DatasetCommon`
* `common_primitives.BayesianLogisticRegression` to `classification.bayesian_logistic_regression.Common`
* `common_primitives.ConvolutionalNeuralNet` to `regression.convolutional_neural_net.TorchCommon`
* `common_primitives.DiagonalMVN` to `operator.diagonal_mvn.Common`
* `common_primitives.FeedForwardNeuralNet` to `regression.feed_forward_neural_net.TorchCommon`
* `common_primitives.ImageReader` to `data_preprocessing.image_reader.Common`
* `common_primitives.KMeans` to `clustering.kmeans.Common`
* `common_primitives.LinearRegression` to `regression.linear_regression.Common`
* `common_primitives.Loss` to `regression.loss.TorchCommon`
* `common_primitives.PCA` to `feature_extraction.pca.Common`
* `common_primitives.OneHotMaker` to `data_preprocessing.one_hot_encoder.Common`
* Fixed a pickling issue of `classifier.RandomForest`.
[#47](https://gitlab.com/datadrivendiscovery/common-primitives/issues/47)
[!48](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/48)
* The `data.ColumnParser` primitive now has an additional hyper-parameter, `replace_index_columns`,
which controls whether index columns are replaced even when returned parsed
columns are otherwise appended.
* Made `data.RemoveDuplicateColumns` fit and remember duplicate columns during training.
[#45](https://gitlab.com/datadrivendiscovery/common-primitives/issues/45)
* Added `match_logic` hyper-parameter to the `data.ReplaceSemanticTypes` primitive
which allows one to control how multiple specified semantic types match.
* Added new primitives:
* `metadata.ComputeMetafeatures`
* `datasets.RegexFilter`
* `datasets.TermFilter`
* `datasets.NumericRangeFilter`
* `evaluation.NoSplitDatasetSplit`
* `evaluation.FixedSplitDatasetSplit`
* Column parser fixed to parse columns with `http://schema.org/DateTime` semantic type.
* Simplified (and made more predictable) the logic of the `combine_columns` utility function
when `return_result` is `new` and `add_index_columns` is set to true: if the output already
contains any index column, input index columns are not added; if there are no index columns,
all input index columns are added at the beginning.
* Fixed `_can_use_inputs_column` in `classifier.RandomForest` by adding a structural type
check, so that only columns with numerical structural types are processed.
* Correctly set column names in `evaluation.ComputeScores` primitive's output.
* Cast indices and columns to match predicted columns' dtypes.
[#33](https://gitlab.com/datadrivendiscovery/common-primitives/issues/33)
* The `datasets.DatasetToDataFrame` primitive no longer tries to generate metadata automatically
because this is not needed (metadata can simply be copied from the dataset); this
speeds up the primitive.
[#34](https://gitlab.com/datadrivendiscovery/common-primitives/issues/34)
* Made it uniform that whenever lists of all column names are generated, the name is
first taken from the metadata, falling back to the name in the DataFrame
(instead of using a column index).
* Made splitting primitives, `classifier.RandomForest`, and `data.UnseenLabelEncoder`
picklable even when unfitted.
* Fixed entry point for `audio.CutAudio` primitive.

## v0.2.0

* Made the primitives listed below operate on semantic types and support different ways to return results.
* Added or updated many primitives:
* `data.ExtractColumns`
* `data.ExtractColumnsBySemanticTypes`
* `data.ExtractColumnsByStructuralTypes`
* `data.RemoveColumns`
* `data.RemoveDuplicateColumns`
* `data.HorizontalConcat`
* `data.CastToType`
* `data.ColumnParser`
* `data.ConstructPredictions`
* `data.DataFrameToNDArray`
* `data.NDArrayToDataFrame`
* `data.DataFrameToList`
* `data.ListToDataFrame`
* `data.NDArrayToList`
* `data.ListToNDArray`
* `data.StackNDArrayColumn`
* `data.AddSemanticTypes`
* `data.RemoveSemanticTypes`
* `data.ReplaceSemanticTypes`
* `data.UnseenLabelEncoder`
* `data.UnseenLabelDecoder`
* `data.ImageReader`
* `data.TextReader`
* `data.VideoReader`
* `data.CSVReader`
* `data.AudioReader`
* `datasets.Denormalize`
* `datasets.DatasetToDataFrame`
* `datasets.UpdateSemanticTypes`
* `datasets.RemoveColumns`
* `evaluation.RedactTargets`
* `evaluation.ComputeScores`
* `evaluation.KFoldDatasetSplit`
* `evaluation.TrainScoreDatasetSplit`
* `audio.CutAudio`
* `classifier.RandomForest`
* Started listing enabled primitives in the [`entry_points.ini`](./entry_points.ini) file.
* Created `devel` branch which contains primitives coded against the
future release of the `d3m` core package (its `devel` branch).
`master` branch of this repository is made against the latest stable
release of the `d3m` core package.
* Dropped support for Python 2.7; Python 3.6 is now required.
* Renamed repository and package to `common-primitives` and `common_primitives`,
respectively.
* Repository migrated to gitlab.com and made public.

## v0.1.1

* Made common primitives work on Python 2.7.

## v0.1.0

* Initial set of common primitives.

+0 -94  common-primitives/HOW_TO_MANAGE.md

@@ -1,94 +0,0 @@
# How to publish primitive annotations

As contributors add or update their primitives, they might want to publish
primitive annotations for the added primitives. When doing this it is important
to also republish all other primitive annotations already published from this
package, because only one version of the package can be installed at
a time and all primitive annotations have to point to the same package in
their `installation` metadata.

Steps to publish primitive annotations (a condensed shell sketch follows the list):
* Operate in a virtual env with the following installed:
* The target core package.
* [Test primitives](https://gitlab.com/datadrivendiscovery/tests-data/tree/master/primitives)
with the same version of primitives as currently published in the `primitives`
repository. Remember to install them in `-e` (editable) mode.
* Update `HISTORY.md` for `vNEXT` release with information about primitives
added or updated. If there was no package release since they were updated last,
do not duplicate entries but just update any existing entries for those primitives
instead, so that once released it is clear what has changed in a release as a whole.
* Make sure tests pass for the primitives being published (primitives added, updated,
and primitives previously published which should now be republished).
* Update `entry_points.ini` and add new primitives. Leave active
only those entries for primitives being (re)published and comment out all others.
* If this is the first time primitives are published after a release of a new `d3m`
core package, leave active only those which were updated to work with
the new `d3m` core package. Leave it to others to update, verify, and publish
the remaining common primitives.
* In a clone of the `primitives` repository, prepare a branch off the up-to-date `master` branch
to add/update primitive annotations. If existing annotations for common primitives
are already there, it is best to remove them first so that annotations for
removed primitives do not stay around; all primitives are re-added in the next step.
* Run `add.sh` in the root of this package, which will add primitive annotations
to `primitives`. See the instructions in the script for more information.
* Verify the changes in `primitives`, then add and commit the files to git.
* Publish the branch in `primitives` and make a merge request.
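
A condensed sketch of the steps above, assuming the public primitives index is cloned as a sibling `d3m-primitives` directory (the layout `add.sh` expects) and using an illustrative branch name:

```
# Prepare a branch in the primitives index clone and drop stale annotations
# (the v*/common-primitives path layout matches what add.sh writes to).
cd ../d3m-primitives
git checkout master && git pull
git checkout -b update-common-primitives
rm -rf v*/common-primitives

# Regenerate and copy annotations, pipelines, and pipeline runs.
cd ../common-primitives
./add.sh

# Commit and publish the branch, then open a merge request.
cd ../d3m-primitives
git add . && git commit -m "Update common primitives."
git push -u origin update-common-primitives
```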

# How to release a new version

A new version is always released from the `master` branch against a stable release
of the `d3m` core package. A new version should be released when there are major
changes to the package (many new primitives added, larger breaking changes).
Sync up with other developers of the repo to suggest or do a release. A condensed
command sketch follows the steps below.

* On the `master` branch:
* Make sure the `HISTORY.md` file is updated with all changes since the last release.
* Change the version in `common_primitives/__init__.py` to the to-be-released version, without the `v` prefix.
* Change `vNEXT` in `HISTORY.md` to the to-be-released version, with the `v` prefix.
* Commit with message `Bumping version for release.`
* `git push`
* Wait for CI to run tests successfully.
* Tag with version prefixed with `v`, e.g., for version `0.2.0`: `git tag v0.2.0`
* `git push` & `git push --tags`
* Change the version in `common_primitives/__init__.py` back to the `devel` string.
* Add a new empty `vNEXT` version at the top of `HISTORY.md`.
* Commit with message `Version bump for development.`
* `git push`
* On the `devel` branch:
* Merge `master` into the `devel` branch: `git merge master`
* Update the branch according to the section below.
* `git push`
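
A condensed, illustrative sketch of the steps above, assuming version `0.2.0` (as in the tag example) and that the version files are edited by hand as described:

```
# On master (after editing HISTORY.md and common_primitives/__init__.py
# to the to-be-released version):
git checkout master
git commit -am "Bumping version for release."
git push                              # wait for CI to pass before tagging
git tag v0.2.0
git push && git push --tags

# Set the version back to "devel" and add an empty vNEXT entry to HISTORY.md, then:
git commit -am "Version bump for development."
git push

# Keep devel in sync with master:
git checkout devel
git merge master
git push
```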

# How to update `master` branch after a release of new `d3m` core package

Hopefully, the `devel` branch already contains code which works against the released
`d3m` core package, so merge the `devel` branch into the `master` branch and update
the files according to the following section.

# Keeping `master` and `devel` branches in sync

Because the `master` and `devel` branches mostly contain the same code,
just made against different versions of the `d3m` core package, it is common
to merge the branches into each other as needed to keep them in sync.
When doing so, the following files are specific to each branch:

* `.gitlab-ci.yml` has a `DEPENDENCY_REF` environment variable which
has to point to `master` on the `master` branch of this repository,
and to `devel` on the `devel` branch.

# How to add an example pipeline

Every common primitive (except those used in non-standard pipelines, like splitting primitives)
should have at least one example pipeline and an associated pipeline run.

Add example pipelines into the corresponding sub-directory (based on the primitive's suffix) of the `pipelines`
directory in the repository. If a pipeline uses multiple common primitives, add it for only one
primitive and create symbolic links for the other primitives.

Create a `fit-score` pipeline run as [described in the primitives index repository](https://gitlab.com/datadrivendiscovery/primitives#adding-a-primitive).
Compress it with `gzip` and store it under the `pipeline_runs` directory in the repository.
Similarly, if the pipeline run corresponds to a pipeline with multiple common primitives,
add it only for one primitive and create symbolic links for the others.

Use the `git-add.sh` script to ensure that all files larger than 100 KB are added to
the repository as git LFS files.
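
A minimal sketch of this layout, using hypothetical file and primitive names (the pipeline file name and the second primitive directory are illustrative only):

```
# Place the example pipeline under the primitive's suffix directory.
SUFFIX=data_transformation.column_parser.Common
mkdir -p "pipelines/$SUFFIX"
cp my-pipeline.json "pipelines/$SUFFIX/"

# Compress the fit-score pipeline run and store it under pipeline_runs/.
mkdir -p "pipeline_runs/$SUFFIX"
gzip -c pipeline_run.yml > "pipeline_runs/$SUFFIX/pipeline_run.yml.gz"

# If the pipeline also uses another common primitive, symlink it there too.
mkdir -p pipelines/data_transformation.dataset_to_dataframe.Common
ln -s "../$SUFFIX/my-pipeline.json" pipelines/data_transformation.dataset_to_dataframe.Common/

# Add everything via git-add.sh so files over 100 KB go through git LFS
# (see the script itself for its exact usage).
./git-add.sh
```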

+0 -201  common-primitives/LICENSE.txt

@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

1. Definitions.

"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.

"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.

"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.

"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.

"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.

"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.

"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).

"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.

"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."

"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.

2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.

3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.

4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:

(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and

(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and

(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and

(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.

You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.

5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.

6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.

7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.

8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.

9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.

END OF TERMS AND CONDITIONS

APPENDIX: How to apply the Apache License to your work.

To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

+0 -2  common-primitives/MANIFEST.in

@@ -1,2 +0,0 @@
include README.md
include LICENSE.txt

+0 -83  common-primitives/README.md

@@ -1,83 +0,0 @@
# Common D3M primitives

A common set of primitives for the D3M project, maintained together.
It contains example primitives, various glue primitives, and other primitives
contributed by performers.

## Installation

This package works on Python 3.6+ and pip 19+.

This package has additional dependencies which are specified in primitives' metadata,
but if you are installing the package manually, you first have to run (on Ubuntu):

```
$ apt-get install build-essential libopenblas-dev libcap-dev ffmpeg
$ pip3 install python-prctl
```

To install common primitives from inside a cloned repository, run:

```
$ pip3 install -e .
```

When cloning the repository, clone it recursively to also get git submodules:

```
$ git clone --recursive https://gitlab.com/datadrivendiscovery/common-primitives.git
```

## Changelog

See [HISTORY.md](./HISTORY.md) for summary of changes to this package.

## Repository structure

The `master` branch contains the latest code of common primitives made against the latest stable
release of the [`d3m` core package](https://gitlab.com/datadrivendiscovery/d3m) (its `master` branch).
The `devel` branch contains the latest code of common primitives made against the
future release of the `d3m` core package (its `devel` branch).

Releases are [tagged](https://gitlab.com/datadrivendiscovery/d3m/tags) but are not done
regularly. Each primitive has its own versions as well, which are not related to package versions.
Generally it is best to just use the latest code available in the `master` or `devel`
branch (depending on which version of the core package you are using).

## Testing locally

For each commit to this repository, tests run automatically in the
[GitLab CI](https://gitlab.com/datadrivendiscovery/common-primitives/pipelines).

If you don't want to wait for the GitLab CI test results and would rather run the tests locally,
you can install and use the [GitLab runner](https://docs.gitlab.com/runner/install/) on your system.

With the local GitLab runner, you can run the tests defined in the [.gitlab-ci.yml](.gitlab-ci.yml)
file of this repository, such as:

```
$ gitlab-runner exec docker style_check
$ gitlab-runner exec docker type_check
```

You can also run the tests available under `/tests` by running:

```
$ python3 run_tests.py
```

## Contribute

Feel free to contribute more primitives to this repository. The idea is to build
a common set of primitives which serve both as examples and as a way to share
the maintenance of some primitives, especially glue primitives.

All primitives are written in Python 3 and are type checked using
[mypy](http://www.mypy-lang.org/), so typing annotations are required.
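
If you want a quick local type check outside the CI jobs, something like the following should work (assuming `mypy` is installed; the CI's exact configuration lives in this repository):

```
$ pip3 install mypy
$ mypy common_primitives
```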

## About Data Driven Discovery Program

The DARPA Data Driven Discovery (D3M) program is researching ways to get machines to build
machine learning pipelines automatically. It is split into three layers:
TA1 (primitives), TA2 (systems which automatically combine primitives into pipelines
and execute them), and TA3 (end-user interfaces).

+0 -24  common-primitives/add.sh

@@ -1,24 +0,0 @@
#!/bin/bash -e

# Assumption is that this repository is cloned into "common-primitives" directory
# which is a sibling of "d3m-primitives" directory with D3M public primitives.

D3M_VERSION="$(python3 -c 'import d3m; print(d3m.__version__)')"

for PRIMITIVE_SUFFIX in $(./list_primitives.py --suffix); do
    echo "$PRIMITIVE_SUFFIX"
    python3 -m d3m index describe -i 4 "d3m.primitives.$PRIMITIVE_SUFFIX" > primitive.json
    pushd ../d3m-primitives > /dev/null
    ./add.py ../common-primitives/primitive.json
    popd > /dev/null
    if [[ -e "pipelines/$PRIMITIVE_SUFFIX" ]]; then
        PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)"
        mkdir -p "$PRIMITIVE_PATH/pipelines"
        find pipelines/$PRIMITIVE_SUFFIX/ \( -name '*.json' -or -name '*.yaml' -or -name '*.yml' -or -name '*.json.gz' -or -name '*.yaml.gz' -or -name '*.yml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipelines" ';'
    fi
    if [[ -e "pipeline_runs/$PRIMITIVE_SUFFIX" ]]; then
        PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)"
        mkdir -p "$PRIMITIVE_PATH/pipeline_runs"
        find pipeline_runs/$PRIMITIVE_SUFFIX/ \( -name '*.yml.gz' -or -name '*.yaml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipeline_runs" ';'
    fi
done

+0 -63  common-primitives/entry_points.ini

@@ -1,63 +0,0 @@
[d3m.primitives]
data_preprocessing.one_hot_encoder.MakerCommon = common_primitives.one_hot_maker:OneHotMakerPrimitive
data_preprocessing.one_hot_encoder.PandasCommon = common_primitives.pandas_onehot_encoder:PandasOneHotEncoderPrimitive
data_transformation.extract_columns.Common = common_primitives.extract_columns:ExtractColumnsPrimitive
data_transformation.extract_columns_by_semantic_types.Common = common_primitives.extract_columns_semantic_types:ExtractColumnsBySemanticTypesPrimitive
data_transformation.extract_columns_by_structural_types.Common = common_primitives.extract_columns_structural_types:ExtractColumnsByStructuralTypesPrimitive
data_transformation.remove_columns.Common = common_primitives.remove_columns:RemoveColumnsPrimitive
data_transformation.remove_duplicate_columns.Common = common_primitives.remove_duplicate_columns:RemoveDuplicateColumnsPrimitive
data_transformation.horizontal_concat.DataFrameCommon = common_primitives.horizontal_concat:HorizontalConcatPrimitive
data_transformation.cast_to_type.Common = common_primitives.cast_to_type:CastToTypePrimitive
data_transformation.column_parser.Common = common_primitives.column_parser:ColumnParserPrimitive
data_transformation.construct_predictions.Common = common_primitives.construct_predictions:ConstructPredictionsPrimitive
data_transformation.dataframe_to_ndarray.Common = common_primitives.dataframe_to_ndarray:DataFrameToNDArrayPrimitive
data_transformation.ndarray_to_dataframe.Common = common_primitives.ndarray_to_dataframe:NDArrayToDataFramePrimitive
data_transformation.dataframe_to_list.Common = common_primitives.dataframe_to_list:DataFrameToListPrimitive
data_transformation.list_to_dataframe.Common = common_primitives.list_to_dataframe:ListToDataFramePrimitive
data_transformation.ndarray_to_list.Common = common_primitives.ndarray_to_list:NDArrayToListPrimitive
data_transformation.list_to_ndarray.Common = common_primitives.list_to_ndarray:ListToNDArrayPrimitive
data_transformation.stack_ndarray_column.Common = common_primitives.stack_ndarray_column:StackNDArrayColumnPrimitive
data_transformation.add_semantic_types.Common = common_primitives.add_semantic_types:AddSemanticTypesPrimitive
data_transformation.remove_semantic_types.Common = common_primitives.remove_semantic_types:RemoveSemanticTypesPrimitive
data_transformation.replace_semantic_types.Common = common_primitives.replace_semantic_types:ReplaceSemanticTypesPrimitive
data_transformation.denormalize.Common = common_primitives.denormalize:DenormalizePrimitive
data_transformation.datetime_field_compose.Common = common_primitives.datetime_field_compose:DatetimeFieldComposePrimitive
data_transformation.grouping_field_compose.Common = common_primitives.grouping_field_compose:GroupingFieldComposePrimitive
data_transformation.dataset_to_dataframe.Common = common_primitives.dataset_to_dataframe:DatasetToDataFramePrimitive
data_transformation.cut_audio.Common = common_primitives.cut_audio:CutAudioPrimitive
data_transformation.rename_duplicate_name.DataFrameCommon = common_primitives.rename_duplicate_columns:RenameDuplicateColumnsPrimitive
#data_transformation.normalize_column_references.Common = common_primitives.normalize_column_references:NormalizeColumnReferencesPrimitive
#data_transformation.normalize_graphs.Common = common_primitives.normalize_graphs:NormalizeGraphsPrimitive
data_transformation.ravel.DataFrameRowCommon = common_primitives.ravel:RavelAsRowPrimitive
data_preprocessing.label_encoder.Common = common_primitives.unseen_label_encoder:UnseenLabelEncoderPrimitive
data_preprocessing.label_decoder.Common = common_primitives.unseen_label_decoder:UnseenLabelDecoderPrimitive
data_preprocessing.image_reader.Common = common_primitives.dataframe_image_reader:DataFrameImageReaderPrimitive
data_preprocessing.text_reader.Common = common_primitives.text_reader:TextReaderPrimitive
data_preprocessing.video_reader.Common = common_primitives.video_reader:VideoReaderPrimitive
data_preprocessing.csv_reader.Common = common_primitives.csv_reader:CSVReaderPrimitive
data_preprocessing.audio_reader.Common = common_primitives.audio_reader:AudioReaderPrimitive
data_preprocessing.regex_filter.Common = common_primitives.regex_filter:RegexFilterPrimitive
data_preprocessing.term_filter.Common = common_primitives.term_filter:TermFilterPrimitive
data_preprocessing.numeric_range_filter.Common = common_primitives.numeric_range_filter:NumericRangeFilterPrimitive
data_preprocessing.datetime_range_filter.Common = common_primitives.datetime_range_filter:DatetimeRangeFilterPrimitive
data_preprocessing.dataset_sample.Common = common_primitives.dataset_sample:DatasetSamplePrimitive
#data_preprocessing.time_interval_transform.Common = common_primitives.time_interval_transform:TimeIntervalTransformPrimitive
data_cleaning.tabular_extractor.Common = common_primitives.tabular_extractor:AnnotatedTabularExtractorPrimitive
evaluation.redact_columns.Common = common_primitives.redact_columns:RedactColumnsPrimitive
evaluation.kfold_dataset_split.Common = common_primitives.kfold_split:KFoldDatasetSplitPrimitive
evaluation.kfold_time_series_split.Common = common_primitives.kfold_split_timeseries:KFoldTimeSeriesSplitPrimitive
evaluation.train_score_dataset_split.Common = common_primitives.train_score_split:TrainScoreDatasetSplitPrimitive
evaluation.no_split_dataset_split.Common = common_primitives.no_split:NoSplitDatasetSplitPrimitive
evaluation.fixed_split_dataset_split.Commmon = common_primitives.fixed_split:FixedSplitDatasetSplitPrimitive
classification.random_forest.Common = common_primitives.random_forest:RandomForestClassifierPrimitive
classification.light_gbm.Common = common_primitives.lgbm_classifier:LightGBMClassifierPrimitive
classification.xgboost_gbtree.Common = common_primitives.xgboost_gbtree:XGBoostGBTreeClassifierPrimitive
classification.xgboost_dart.Common = common_primitives.xgboost_dart:XGBoostDartClassifierPrimitive
regression.xgboost_gbtree.Common = common_primitives.xgboost_regressor:XGBoostGBTreeRegressorPrimitive
schema_discovery.profiler.Common = common_primitives.simple_profiler:SimpleProfilerPrimitive
operator.column_map.Common = common_primitives.column_map:DataFrameColumnMapPrimitive
operator.dataset_map.DataFrameCommon = common_primitives.dataset_map:DataFrameDatasetMapPrimitive
data_preprocessing.flatten.DataFrameCommon = common_primitives.dataframe_flatten:DataFrameFlattenPrimitive
metalearning.metafeature_extractor.Common = common_primitives.compute_metafeatures:ComputeMetafeaturesPrimitive
data_augmentation.datamart_augmentation.Common = common_primitives.datamart_augment:DataMartAugmentPrimitive
data_augmentation.datamart_download.Common = common_primitives.datamart_download:DataMartDownloadPrimitive

+ 0
- 5
common-primitives/git-add.sh View File

@@ -1,5 +0,0 @@
#!/bin/bash -e

# This requires git LFS 2.9.0 or newer.

find * -type f -size +100k -exec git lfs track --filename '{}' +

+ 0
- 21
common-primitives/git-check.sh View File

@@ -1,21 +0,0 @@
#!/bin/bash -e

if git rev-list --objects --all \
| git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \
| sed -n 's/^blob //p' \
| awk '$2 >= 100*(2^10)' \
| awk '{print $3}' \
| egrep -v '(^|/).gitattributes$' ; then
echo "Repository contains committed objects larger than 100 KB."
exit 1
fi

if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 < 100*(2^10)' | awk '{print $2}' | grep . ; then
echo "Repository contains LFS objects smaller than 100 KB."
exit 1
fi

if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 >= 2*(2^30)' | awk '{print $2}' | grep . ; then
echo "Repository contains LFS objects not smaller than 2 GB."
exit 1
fi

+ 0
- 32
common-primitives/list_primitives.py View File

@@ -1,32 +0,0 @@
#!/usr/bin/env python3

import argparse
import configparser
import re


class CaseSensitiveConfigParser(configparser.ConfigParser):
optionxform = staticmethod(str)


parser = argparse.ArgumentParser(description='List enabled common primitives.')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--suffix', action='store_true', help='list primitive suffixes of all enabled common primitives')
group.add_argument('--python', action='store_true', help='list Python paths of all enabled common primitives')
group.add_argument('--files', action='store_true', help='list file paths of all enabled common primitives')

args = parser.parse_args()

entry_points = CaseSensitiveConfigParser()
entry_points.read('entry_points.ini')

for primitive_suffix, primitive_path in entry_points.items('d3m.primitives'):
if args.python:
print("d3m.primitives.{primitive_suffix}".format(primitive_suffix=primitive_suffix))
elif args.suffix:
print(primitive_suffix)
elif args.files:
primitive_path = re.sub(':.+$', '', primitive_path)
primitive_path = re.sub(r'\.', '/', primitive_path)
print("{primitive_path}.py".format(primitive_path=primitive_path))


BIN
common-primitives/pipeline_runs/classification.light_gbm.DataFrameCommon/1.yaml.gz View File


BIN
common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/1.yaml.gz View File


+ 0
- 1
common-primitives/pipeline_runs/classification.random_forest.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

BIN
common-primitives/pipeline_runs/classification.xgboost_dart.DataFrameCommon/1.yaml.gz View File


BIN
common-primitives/pipeline_runs/classification.xgboost_gbtree.DataFrameCommon/1.yaml.gz View File


BIN
common-primitives/pipeline_runs/data_augmentation.datamart_augmentation.Common/2.yaml.gz View File


BIN
common-primitives/pipeline_runs/data_preprocessing.dataset_sample.Common/1.yaml.gz View File


+ 0
- 1
common-primitives/pipeline_runs/data_preprocessing.one_hot_encoder.PandasCommon/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/1.yaml.gz View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/1.yaml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.column_parser.DataFrameCommon/pipeline_run_group_field_compose.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/1.yaml.gz View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/1.yaml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.construct_predictions.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/1.yaml.gz View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/1.yaml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.dataset_to_dataframe.Common/pipeline_run_group_field_compose.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/1.yaml.gz View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/1.yaml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

BIN
common-primitives/pipeline_runs/data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz View File


BIN
common-primitives/pipeline_runs/data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz View File


+ 0
- 1
common-primitives/pipeline_runs/data_transformation.horizontal_concat.DataFrameConcat/1.yaml.gz View File

@@ -1 +0,0 @@
../data_preprocessing.one_hot_encoder.MakerCommon/1.yaml.gz

+ 0
- 1
common-primitives/pipeline_runs/data_transformation.remove_columns.Common/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 4729
common-primitives/pipeline_runs/regression.xgboost_gbtree.DataFrameCommon/1.yml
File diff suppressed because it is too large
View File


+ 0
- 1
common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_extract_structural_types.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz

+ 0
- 1
common-primitives/pipeline_runs/schema_discovery.profiler.Common/pipeline_run_group_field_compose.yml.gz View File

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz

+ 0
- 246
common-primitives/pipelines/classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json View File

@@ -1,246 +0,0 @@
{
"context": "TESTING",
"created": "2019-02-12T01:09:44.343543Z",
"id": "d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.7.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.3.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.5.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "259aa747-795c-435e-8e33-8c32a4c83c6b",
"name": "LightGBM GBTree classifier",
"python_path": "d3m.primitives.classification.light_gbm.Common",
"version": "0.1.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}

+ 0
- 1
common-primitives/pipelines/classification.random_forest.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 110
common-primitives/pipelines/classification.random_forest.DataFrameCommon/ccad0f9c-130e-4063-a91e-ea65a18cb041.yaml View File

@@ -1,110 +0,0 @@
id: ccad0f9c-130e-4063-a91e-ea65a18cb041
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json
source:
name: Mitar
created: "2019-06-05T11:48:52.806069Z"
context: TESTING
name: Random Forest classifier pipeline
description: |
A simple pipeline which runs Random Forest classifier on tabular data.
inputs:
- name: input dataset
outputs:
- name: predictions
data: steps.5.produce
steps:
# Step 0.
- type: PRIMITIVE
primitive:
id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e
version: 0.2.0
python_path: d3m.primitives.data_transformation.denormalize.Common
name: Denormalize datasets
arguments:
inputs:
type: CONTAINER
data: inputs.0
outputs:
- id: produce
# Step 1.
- type: PRIMITIVE
primitive:
id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65
version: 0.3.0
python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common
name: Extract a DataFrame from a Dataset
arguments:
inputs:
type: CONTAINER
data: steps.0.produce
outputs:
- id: produce
# Step 2.
- type: PRIMITIVE
primitive:
id: d510cb7a-1782-4f51-b44c-58f0236e47c7
version: 0.6.0
python_path: d3m.primitives.data_transformation.column_parser.Common
name: Parses strings into their types
arguments:
inputs:
type: CONTAINER
data: steps.1.produce
outputs:
- id: produce
# Step 3.
- type: PRIMITIVE
primitive:
id: d016df89-de62-3c53-87ed-c06bb6a23cde
version: 2019.6.7
python_path: d3m.primitives.data_cleaning.imputer.SKlearn
name: sklearn.impute.SimpleImputer
arguments:
inputs:
type: CONTAINER
data: steps.2.produce
outputs:
- id: produce
hyperparams:
use_semantic_types:
type: VALUE
data: true
return_result:
type: VALUE
data: replace
# Step 4.
- type: PRIMITIVE
primitive:
id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af
version: 0.4.0
python_path: d3m.primitives.classification.random_forest.Common
name: Random forest classifier
arguments:
inputs:
type: CONTAINER
data: steps.3.produce
outputs:
type: CONTAINER
data: steps.3.produce
outputs:
- id: produce
hyperparams:
return_result:
type: VALUE
data: replace
# Step 5.
- type: PRIMITIVE
primitive:
id: 8d38b340-f83f-4877-baaa-162f8e551736
version: 0.3.0
python_path: d3m.primitives.data_transformation.construct_predictions.Common
name: Construct pipeline predictions output
arguments:
inputs:
type: CONTAINER
data: steps.4.produce
reference:
type: CONTAINER
data: steps.2.produce
outputs:
- id: produce

+ 0
- 246
common-primitives/pipelines/classification.xgboost_dart.DataFrameCommon/b7a24816-2518-4073-9c45-b97f2b2fee30.json View File

@@ -1,246 +0,0 @@
{
"context": "TESTING",
"created": "2019-02-12T01:33:29.921236Z",
"id": "b7a24816-2518-4073-9c45-b97f2b2fee30",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.7.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.3.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.5.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "7476950e-4373-4cf5-a852-7e16afb8e098",
"name": "XGBoost DART classifier",
"python_path": "d3m.primitives.classification.xgboost_dart.Common",
"version": "0.1.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}

+ 0
- 246
common-primitives/pipelines/classification.xgboost_gbtree.DataFrameCommon/4d402450-2562-48cc-93fd-719fb658c43c.json View File

@@ -1,246 +0,0 @@
{
"context": "TESTING",
"created": "2019-02-12T01:18:47.753202Z",
"id": "4d402450-2562-48cc-93fd-719fb658c43c",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.7.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.3.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.5.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "fe0841b7-6e70-4bc3-a56c-0670a95ebc6a",
"name": "XGBoost GBTree classifier",
"python_path": "d3m.primitives.classification.xgboost_gbtree.Common",
"version": "0.1.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}

+ 0
- 522
common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/3afd2bd2-7ba1-4ac1-928f-fad0c39a05e5.json
File diff suppressed because it is too large
View File


+ 0
- 342
common-primitives/pipelines/data_augmentation.datamart_augmentation.Common/4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json View File

@@ -1,342 +0,0 @@
{
"id": "4ff2f21d-1bba-4c44-bb96-e05728bcf6ed",
"name": "classification_template(imputer=d3m.primitives.data_cleaning.imputer.SKlearn, classifier=d3m.primitives.regression.random_forest.SKlearn)",
"description": "To be used with NYU datamart.",
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"created": "2019-06-06T21:30:30Z",
"context": "TESTING",
"inputs": [
{
"name": "input dataset"
}
],
"outputs": [
{
"data": "steps.12.produce",
"name": "predictions"
}
],
"steps": [
{
"type": "PRIMITIVE",
"primitive": {
"id": "fe0f1ac8-1d39-463a-b344-7bd498a31b91",
"version": "0.1",
"name": "Perform dataset augmentation using Datamart",
"python_path": "d3m.primitives.data_augmentation.datamart_augmentation.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "inputs.0"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"system_identifier": {
"type": "VALUE",
"data": "NYU"
},
"search_result": {
"type": "VALUE",
"data": "{\"augmentation\": {\"left_columns\": [[1]], \"left_columns_names\": [\"tpep_pickup_datetime\"], \"right_columns\": [[0]], \"type\": \"join\"}, \"id\": \"datamart.url.a3943fd7892d5d219012f889327c6661\", \"metadata\": {\"columns\": [{\"coverage\": [{\"range\": {\"gte\": 1451610000.0, \"lte\": 1540252800.0}}], \"mean\": 1495931400.0, \"name\": \"DATE\", \"semantic_types\": [\"http://schema.org/DateTime\"], \"stddev\": 25590011.431395352, \"structural_type\": \"http://schema.org/Text\"}, {\"name\": \"HOURLYSKYCONDITIONS\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": -17.2, \"lte\": 37.8}}], \"mean\": 14.666224009096823, \"name\": \"HOURLYDRYBULBTEMPC\", \"semantic_types\": [], \"stddev\": 9.973788193915643, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 11.0, \"lte\": 100.0}}], \"mean\": 60.70849577647823, \"name\": \"HOURLYRelativeHumidity\", \"semantic_types\": [], \"stddev\": 18.42048051096981, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 0.0, \"lte\": 41.0}}], \"mean\": 10.68859649122807, \"name\": \"HOURLYWindSpeed\", \"semantic_types\": [], \"stddev\": 5.539675475162907, \"structural_type\": \"http://schema.org/Float\"}, {\"name\": \"HOURLYWindDirection\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": 28.89, \"lte\": 30.81}}], \"mean\": 29.90760315139694, \"name\": \"HOURLYStationPressure\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/PhoneNumber\"], \"stddev\": 0.24584097919742368, \"structural_type\": \"http://schema.org/Float\"}], \"date\": \"2019-01-22T01:54:58.281183Z\", \"description\": \"This data contains weather information for NY city around LaGuardia Airport from 2016 to 2018; weath...\", \"materialize\": {\"direct_url\": \"https://drive.google.com/uc?export=download&id=1jRwzZwEGMICE3n6-nwmVxMD2c0QCHad4\", \"identifier\": \"datamart.url\"}, \"name\": \"Newyork Weather Data around Airport 2016-18\", \"nb_rows\": 24624, \"size\": 1523693}, \"score\": 1.0, \"supplied_id\": \"DA_ny_taxi_demand_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}"
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e",
"version": "0.2.0",
"name": "Denormalize datasets",
"python_path": "d3m.primitives.data_transformation.denormalize.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.0.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"version": "0.3.0",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.1.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"version": "0.6.0",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.2.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"version": "0.3.0",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"semantic_types": {
"type": "VALUE",
"data": [
"https://metadata.datadrivendiscovery.org/types/Attribute"
]
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.11.13",
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.4.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"strategy": {
"type": "VALUE",
"data": "most_frequent"
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"name": "sklearn.preprocessing.data.OneHotEncoder",
"python_path": "d3m.primitives.data_transformation.one_hot_encoder.SKlearn",
"version": "2019.11.13",
"id": "c977e879-1bf5-3829-b5b0-39b00233aff5"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.5.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"handle_unknown": {
"type": "VALUE",
"data": "ignore"
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "eb5fe752-f22a-4090-948b-aafcef203bf5",
"version": "0.2.0",
"name": "Casts DataFrame",
"python_path": "d3m.primitives.data_transformation.cast_to_type.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.6.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"type_to_cast": {
"type": "VALUE",
"data": "float"
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"version": "0.3.0",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"semantic_types": {
"type": "VALUE",
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
]
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "eb5fe752-f22a-4090-948b-aafcef203bf5",
"version": "0.2.0",
"name": "Casts DataFrame",
"python_path": "d3m.primitives.data_transformation.cast_to_type.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.8.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"name": "sklearn.ensemble.forest.RandomForestRegressor",
"python_path": "d3m.primitives.regression.random_forest.SKlearn",
"version": "2019.11.13",
"id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.7.produce"
},
"outputs": {
"type": "CONTAINER",
"data": "steps.9.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"version": "0.3.0",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.3.produce"
}
},
"outputs": [
{
"id": "produce"
}
],
"hyperparams": {
"semantic_types": {
"type": "VALUE",
"data": [
"https://metadata.datadrivendiscovery.org/types/Target",
"https://metadata.datadrivendiscovery.org/types/PrimaryKey"
]
}
}
},
{
"type": "PRIMITIVE",
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"version": "0.3.0",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common"
},
"arguments": {
"inputs": {
"type": "CONTAINER",
"data": "steps.10.produce"
},
"reference": {
"type": "CONTAINER",
"data": "steps.11.produce"
}
},
"outputs": [
{
"id": "produce"
}
]
}
]
}

+ 0
- 123
common-primitives/pipelines/data_preprocessing.dataset_sample.Common/387d432a-9893-4558-b190-1c5e9e399dbf.yaml View File

@@ -1,123 +0,0 @@
id: 387d432a-9893-4558-b190-1c5e9e399dbf
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json
source:
name: Jeffrey Gleason
created: "2019-06-05T2:48:52.806069Z"
context: TESTING
name: Dataset sample test pipeline
description: |
A simple pipeline which runs Random Forest classifier on tabular data after sampling the dataset (50% of rows)
inputs:
- name: input dataset
outputs:
- name: predictions
data: steps.6.produce
steps:
# Step 0.
- type: PRIMITIVE
primitive:
id: 268315c1-7549-4aee-a4cc-28921cba74c0
version: 0.1.0
python_path: d3m.primitives.data_preprocessing.dataset_sample.Common
name: Dataset sampling primitive
arguments:
inputs:
type: CONTAINER
data: inputs.0
outputs:
- id: produce
# Step 1.
- type: PRIMITIVE
primitive:
id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e
version: 0.2.0
python_path: d3m.primitives.data_transformation.denormalize.Common
name: Denormalize datasets
arguments:
inputs:
type: CONTAINER
data: steps.0.produce
outputs:
- id: produce
# Step 2.
- type: PRIMITIVE
primitive:
id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65
version: 0.3.0
python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common
name: Extract a DataFrame from a Dataset
arguments:
inputs:
type: CONTAINER
data: steps.1.produce
outputs:
- id: produce
# Step 3.
- type: PRIMITIVE
primitive:
id: d510cb7a-1782-4f51-b44c-58f0236e47c7
version: 0.6.0
python_path: d3m.primitives.data_transformation.column_parser.Common
name: Parses strings into their types
arguments:
inputs:
type: CONTAINER
data: steps.2.produce
outputs:
- id: produce
# Step 4.
- type: PRIMITIVE
primitive:
id: d016df89-de62-3c53-87ed-c06bb6a23cde
version: 2019.6.7
python_path: d3m.primitives.data_cleaning.imputer.SKlearn
name: sklearn.impute.SimpleImputer
arguments:
inputs:
type: CONTAINER
data: steps.3.produce
outputs:
- id: produce
hyperparams:
use_semantic_types:
type: VALUE
data: true
return_result:
type: VALUE
data: replace
# Step 5.
- type: PRIMITIVE
primitive:
id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af
version: 0.4.0
python_path: d3m.primitives.classification.random_forest.Common
name: Random forest classifier
arguments:
inputs:
type: CONTAINER
data: steps.4.produce
outputs:
type: CONTAINER
data: steps.4.produce
outputs:
- id: produce
hyperparams:
return_result:
type: VALUE
data: replace
# Step 6.
- type: PRIMITIVE
primitive:
id: 8d38b340-f83f-4877-baaa-162f8e551736
version: 0.3.0
python_path: d3m.primitives.data_transformation.construct_predictions.Common
name: Construct pipeline predictions output
arguments:
inputs:
type: CONTAINER
data: steps.5.produce
reference:
type: CONTAINER
data: steps.3.produce
outputs:
- id: produce

+ 0
- 300
common-primitives/pipelines/data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json View File

@@ -1,300 +0,0 @@
{
"context": "TESTING",
"created": "2019-02-12T02:10:00.929519Z",
"id": "2b307634-f01e-412e-8d95-7e54afd4731f",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.9.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.3.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.2.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "eaec420d-46eb-4ddf-a2cd-b8097345ff3e",
"name": "One-hot maker",
"python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.MakerCommon",
"version": "0.2.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"left": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"right": {
"data": "steps.5.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "aff6a77a-faa0-41c5-9595-de2e7f7c4760",
"name": "Concatenate two dataframes",
"python_path": "d3m.primitives.data_transformation.horizontal_concat.DataFrameCommon",
"version": "0.2.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.7.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "1dd82833-5692-39cb-84fb-2455683075f3",
"name": "sklearn.ensemble.forest.RandomForestClassifier",
"python_path": "d3m.primitives.classification.random_forest.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.8.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}

+ 0
- 1
common-primitives/pipelines/data_preprocessing.one_hot_encoder.PandasCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 1
common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json

+ 0
- 1
common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/a8c40699-c48d-4f12-aa18-639c5fb6baae.json View File

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json

+ 0
- 1
common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 1
common-primitives/pipelines/data_transformation.column_parser.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json

+ 0
- 1
common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/4ec215d1-6484-4502-a6dd-f659943ccb94.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json

+ 0
- 1
common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 1
common-primitives/pipelines/data_transformation.construct_predictions.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json

+ 0
- 1
common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json

+ 0
- 1
common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json View File

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json

+ 0
- 1
common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 1
common-primitives/pipelines/data_transformation.dataset_to_dataframe.Common/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json

+ 0
- 1
common-primitives/pipelines/data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json View File

@@ -1 +0,0 @@
{"id": "4ec215d1-6484-4502-a6dd-f659943ccb94", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T17:49:59.327063Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [6]}}}, {"type": "PRIMITIVE", "primitive": {"id": "09f252eb-215d-4e0b-9a60-fcd967f5e708", "version": "0.2.0", "python_path": "d3m.primitives.data_transformation.encoder.DistilTextEncoder", "name": "Text encoder", "digest": "e468d66d1eda057a61b2c79ecf5288f137778f47dac9eabdc60707a4941532a3"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"encoder_type": {"type": "VALUE", "data": "tfidf"}}}, {"type": "PRIMITIVE", "primitive": {"id": "e0ad06ce-b484-46b0-a478-c567e1ea7e02", "version": "0.2.0", "python_path": "d3m.primitives.learner.random_forest.DistilEnsembleForest", "name": "EnsembleForest", "digest": "4ba7a354b15ea626bf96aa771a2a3cba034ad5d0a8ccdbbf68bce2d828db1b4d"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": 
"d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "a26edc0cc9bcf9121189186d621ff1b4cebb2afc76b6ef171d7d8194e55cf475"}

+ 0
- 71
common-primitives/pipelines/data_transformation.extract_columns.Common/pipeline.py View File

@@ -1,71 +0,0 @@
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Simple profiler primitive
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: column_parser
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Step 3: Extract text column explicitly
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
step_3.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data = [25])
step_3.add_output('produce')
pipeline_description.add_step(step_3)

# Step 4: Extract target column explicitly
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
step_4.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data = [6])
step_4.add_output('produce')
pipeline_description.add_step(step_4)

# Step 5: encode text column
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.encoder.DistilTextEncoder'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
step_5.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
step_5.add_hyperparameter(name='encoder_type', argument_type=ArgumentType.VALUE, data = 'tfidf')
step_5.add_output('produce')
pipeline_description.add_step(step_5)

# Step 6: classifier
step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.learner.random_forest.DistilEnsembleForest'))
step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
step_6.add_output('produce')
pipeline_description.add_step(step_6)

# Step 7: construct output
step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
step_7.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_7.add_output('produce')
pipeline_description.add_step(step_7)

# Final Output
pipeline_description.add_output(name='output predictions', data_reference='steps.7.produce')

# Output json pipeline
blob = pipeline_description.to_json()
filename = blob[8:44] + '.json'
with open(filename, 'w') as outfile:
outfile.write(blob)


+ 0
- 1
common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/b523335c-0c47-4d02-a582-f69609cde1e8.json View File

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 1
common-primitives/pipelines/data_transformation.extract_columns_by_semantic_types.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json View File

@@ -1 +0,0 @@
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json

+ 0
- 1
common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

@@ -1 +0,0 @@
{"id": "b523335c-0c47-4d02-a582-f69609cde1e8", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:51:17.782254Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.9.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "79674d68-9b93-4359-b385-7b5f60645b06", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_structural_types.Common", "name": "Extracts columns by structural type", "digest": "7805010b9581bb96c035fefa5943209c69a1e234f10d9057d487af42c0fd4830"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "f6315ca9-ca39-4e13-91ba-1964ee27281c", "version": "0.1.0", "python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon", "name": "Pandas one hot encoder", "digest": "ed1217d4d7c017d8239b4f958c8e6ca0b3b67966ccb50cc5c578a9f14e465ec0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"use_columns": {"type": "VALUE", "data": [2, 5]}}}, {"type": "PRIMITIVE", "primitive": {"id": "3b09ba74-cc90-4f22-9e0a-0cf4f29a7e28", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.remove_columns.Common", "name": "Removes columns", "digest": "a725d149595186b85f1dea2bacbf4b853712b6a50eddb7c4c2295fabc3a04df1"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": 
"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Target"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "37c2b19d-bdab-4a30-ba08-6be49edcc6af", "version": "0.4.0", "python_path": "d3m.primitives.classification.random_forest.Common", "name": "Random forest classifier", "digest": "f5f702fc561775a6064c64c008a519f605eb00ca80f59a5d5e39b1340c7c015e"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.7.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.8.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "7929f79fa8e2aaddcbe66d0f592525081280549e0713198e583728ff88b0f895"}

+ 0
- 83
common-primitives/pipelines/data_transformation.extract_columns_by_structural_types.Common/pipeline.py

@@ -1,83 +0,0 @@
from d3m import index
from d3m.metadata.base import ArgumentType, Context
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name='inputs')

# Step 0: dataset_to_dataframe
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common'))
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0')
step_0.add_output('produce')
pipeline_description.add_step(step_0)

# Step 1: Simple profiler primitive
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common'))
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_1.add_output('produce')
pipeline_description.add_step(step_1)

# Step 2: Extract columns by structural type
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_structural_types.Common'))
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce')
step_2.add_output('produce')
pipeline_description.add_step(step_2)

# Step 3: column_parser
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common'))
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce')
step_3.add_output('produce')
pipeline_description.add_step(step_3)

# Step 4: one-hot encode
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon'))
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce')
step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=[2, 5])
step_4.add_output('produce')
pipeline_description.add_step(step_4)

# Step 5: remove text column
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.remove_columns.Common'))
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce')
step_5.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data=[25])
step_5.add_output('produce')
pipeline_description.add_step(step_5)

# Step 6: extract attributes
step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
step_6.add_hyperparameter(name="semantic_types", argument_type=ArgumentType.VALUE, data=["https://metadata.datadrivendiscovery.org/types/Attribute"],)
step_6.add_output('produce')
pipeline_description.add_step(step_6)

# Step 7: extract target
step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common'))
step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce')
step_7.add_hyperparameter(name="semantic_types", argument_type=ArgumentType.VALUE, data=["https://metadata.datadrivendiscovery.org/types/Target"],)
step_7.add_output('produce')
pipeline_description.add_step(step_7)

# Step 8: classifier
step_8 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.classification.random_forest.Common'))
step_8.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce')
step_8.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce')
step_8.add_output('produce')
pipeline_description.add_step(step_8)

# Step 9: construct output
step_9 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common'))
step_9.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.8.produce')
step_9.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce')
step_9.add_output('produce')
pipeline_description.add_step(step_9)

# Final Output
pipeline_description.add_output(name='output predictions', data_reference='steps.9.produce')

# Output json pipeline
blob = pipeline_description.to_json()
filename = blob[8:44] + '.json'
with open(filename, 'w') as outfile:
outfile.write(blob)
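
# Note: these pipeline scripts repeat the same add_argument / add_output / add_step
# boilerplate for every single-input step. A minimal sketch of a helper that chains
# such steps in sequence, assuming only the d3m calls already used above and the
# Pipeline.steps list maintained by add_step; the add_chained_step name and the loop
# are hypothetical, not part of common-primitives:

from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

def add_chained_step(pipeline, python_path, data_reference, hyperparams=None):
    # Build one primitive step whose 'inputs' argument reads from data_reference,
    # register it on the pipeline, and return the reference to its 'produce' output.
    step = PrimitiveStep(primitive=index.get_primitive(python_path))
    step.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference=data_reference)
    for name, data in (hyperparams or {}).items():
        step.add_hyperparameter(name=name, argument_type=ArgumentType.VALUE, data=data)
    step.add_output('produce')
    pipeline.add_step(step)
    return 'steps.{}.produce'.format(len(pipeline.steps) - 1)

# Example: the first three steps of the pipeline above.
pipeline = Pipeline()
pipeline.add_input(name='inputs')
ref = add_chained_step(pipeline, 'd3m.primitives.data_transformation.dataset_to_dataframe.Common', 'inputs.0')
ref = add_chained_step(pipeline, 'd3m.primitives.schema_discovery.profiler.Common', ref)
ref = add_chained_step(pipeline, 'd3m.primitives.data_transformation.extract_columns_by_structural_types.Common', ref)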


+ 0
- 1
common-primitives/pipelines/data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json

@@ -1 +0,0 @@
{"id": "a8c40699-c48d-4f12-aa18-639c5fb6baae", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:35:58.976691Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.4.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"parse_semantic_types": {"type": "VALUE", "data": ["http://schema.org/Boolean", "http://schema.org/Integer", "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/FloatVector", "http://schema.org/DateTime"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "59db88b9-dd81-4e50-8f43-8f2af959560b", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.grouping_field_compose.Common", "name": "Grouping Field Compose", "digest": "e93815bfdb1c82ce0e2fa61f092d6ee9bcf39367a27072accbb9f0dd9189fb03"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "76b5a479-c209-4d94-92b5-7eba7a4d4499", "version": "1.0.2", "python_path": "d3m.primitives.time_series_forecasting.vector_autoregression.VAR", "name": "VAR", "digest": "7e22a1e7fe228114a5788f16a8d3c7709ed3a98a90e9cc82e3b80ab5f232d352"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "da2c7d2605256f263ca4725fe7385be5e027a3ddadc8dbf7523ff98bcd016005"}

+ 0
- 100
common-primitives/pipelines/data_transformation.grouping_field_compose.Common/pipeline.py

@@ -1,100 +0,0 @@
from d3m import index
from d3m.metadata.base import ArgumentType
from d3m.metadata.pipeline import Pipeline, PrimitiveStep

# Creating pipeline
pipeline_description = Pipeline()
pipeline_description.add_input(name="inputs")

# Step 0: DS to DF on input DS
step_0 = PrimitiveStep(
primitive=index.get_primitive(
"d3m.primitives.data_transformation.dataset_to_dataframe.Common"
)
)
step_0.add_argument(
name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0"
)
step_0.add_output("produce")
pipeline_description.add_step(step_0)

# Step 1: Simple Profiler Column Role Annotation
step_1 = PrimitiveStep(
primitive=index.get_primitive("d3m.primitives.schema_discovery.profiler.Common")
)
step_1.add_argument(
name="inputs",
argument_type=ArgumentType.CONTAINER,
data_reference="steps.0.produce",
)
step_1.add_output("produce")
pipeline_description.add_step(step_1)

# Step 2: column parser on input DF
step_2 = PrimitiveStep(
primitive=index.get_primitive(
"d3m.primitives.data_transformation.column_parser.Common"
)
)
step_2.add_argument(
name="inputs",
argument_type=ArgumentType.CONTAINER,
data_reference="steps.1.produce",
)
step_2.add_output("produce")
step_2.add_hyperparameter(
name="parse_semantic_types",
argument_type=ArgumentType.VALUE,
data=[
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime",
],
)
pipeline_description.add_step(step_2)

# Step 3: Grouping Field Compose
step_3 = PrimitiveStep(
primitive=index.get_primitive(
"d3m.primitives.data_transformation.grouping_field_compose.Common"
)
)
step_3.add_argument(
name="inputs",
argument_type=ArgumentType.CONTAINER,
data_reference="steps.2.produce",
)
step_3.add_output("produce")
pipeline_description.add_step(step_3)

# Step 4: forecasting primitive
step_4 = PrimitiveStep(
primitive=index.get_primitive(
"d3m.primitives.time_series_forecasting.vector_autoregression.VAR"
)
)
step_4.add_argument(
name="inputs",
argument_type=ArgumentType.CONTAINER,
data_reference="steps.3.produce",
)
step_4.add_argument(
name="outputs",
argument_type=ArgumentType.CONTAINER,
data_reference="steps.3.produce",
)
step_4.add_output("produce")
pipeline_description.add_step(step_4)

# Final Output
pipeline_description.add_output(
name="output predictions", data_reference="steps.4.produce"
)

# Output json pipeline
blob = pipeline_description.to_json()
filename = blob[8:44] + ".json"
with open(filename, "w") as outfile:
outfile.write(blob)

+ 0
- 1
common-primitives/pipelines/data_transformation.horizontal_concat.DataFrameConcat/2b307634-f01e-412e-8d95-7e54afd4731f.json

@@ -1 +0,0 @@
../data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json

+ 0
- 1
common-primitives/pipelines/data_transformation.remove_columns.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 272
common-primitives/pipelines/data_transformation.rename_duplicate_name.DataFrameCommon/11ee9290-992d-4e48-97ed-1a6e4c15f92f.json

@@ -1,272 +0,0 @@
{
"context": "TESTING",
"created": "2019-02-12T02:01:52.663008Z",
"id": "11ee9290-992d-4e48-97ed-1a6e4c15f92f",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.8.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"separator": {
"data": "----",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "7b067a78-4ad4-411d-9cf9-87bcee38ac73",
"name": "Rename all the duplicated name column in DataFrame",
"python_path": "d3m.primitives.data_transformation.rename_duplicate_name.DataFrameCommon",
"version": "0.2.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.2.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.2.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.5.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "1dd82833-5692-39cb-84fb-2455683075f3",
"name": "sklearn.ensemble.forest.RandomForestClassifier",
"python_path": "d3m.primitives.classification.random_forest.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.7.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}
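
# Note: serialized pipelines like the one above are plain JSON, so their step order can
# be inspected without loading the d3m runtime. A minimal sketch using only the standard
# library; the local filename is whatever the pipeline was saved as, so the path below is
# hypothetical:

import json

# Print each step's primitive python_path in execution order.
with open('11ee9290-992d-4e48-97ed-1a6e4c15f92f.json') as f:
    pipeline = json.load(f)
for i, step in enumerate(pipeline['steps']):
    print(i, step['primitive']['python_path'])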

+ 0
- 83
common-primitives/pipelines/evaluation.kfold_timeseries_split.Common/k-fold-timeseries-split.yml

@@ -1,83 +0,0 @@
id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json
source:
name: Jeffrey Gleason
created: "2019-04-08T16:18:27.250294Z"
context: TESTING
name: K-fold split of timeseries datasets
description: |
K-fold split of timeseries datasets for cross-validation.
inputs:
- name: folds
- name: full dataset
outputs:
- name: train datasets
data: steps.0.produce
- name: test datasets
data: steps.2.produce
- name: score datasets
data: steps.1.produce
steps:
# Step 0.
- type: PRIMITIVE
primitive:
id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b
version: 0.1.0
python_path: d3m.primitives.evaluation.kfold_time_series_split.Common
name: K-fold cross-validation timeseries dataset splits
arguments:
inputs:
type: CONTAINER
data: inputs.0
dataset:
type: CONTAINER
data: inputs.1
outputs:
- id: produce
- id: produce_score_data
# Step 1. We redact privileged attributes for both score and test splits.
- type: PRIMITIVE
primitive:
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6
version: 0.2.0
python_path: d3m.primitives.evaluation.redact_columns.Common
name: Redact columns for evaluation
arguments:
inputs:
type: CONTAINER
data: steps.0.produce_score_data
outputs:
- id: produce
hyperparams:
semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/PrivilegedData
add_semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData
- https://metadata.datadrivendiscovery.org/types/MissingData
# Step 2. We further redact targets in test split.
- type: PRIMITIVE
primitive:
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6
version: 0.2.0
python_path: d3m.primitives.evaluation.redact_columns.Common
name: Redact columns for evaluation
arguments:
inputs:
type: CONTAINER
data: steps.1.produce
outputs:
- id: produce
hyperparams:
semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/TrueTarget
add_semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/RedactedTarget
- https://metadata.datadrivendiscovery.org/types/MissingData

+ 0
- 108
common-primitives/pipelines/operator.dataset_map.DataFrameCommon/k-fold-timeseries-split-raw.yml

@@ -1,108 +0,0 @@
# TODO: change name
id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json
source:
name: Jeffrey Gleason
created: "2019-12-19T16:29:34.702933Z"
context: TESTING
name: K-fold split of timeseries datasets
description: |
K-fold split of timeseries datasets for cross-validation.
inputs:
- name: folds
- name: full dataset
outputs:
- name: train datasets
data: steps.2.produce
- name: test datasets
data: steps.4.produce
- name: score datasets
data: steps.3.produce
steps:
# Step 0. Simon Data Typing primitive to infer DateTime column
- type: PRIMITIVE
primitive:
id: d2fa8df2-6517-3c26-bafc-87b701c4043a
version: 1.2.2
python_path: d3m.primitives.data_cleaning.column_type_profiler.Simon
name: simon
# Step 1. Mapped Simon Data Typing primitive to infer DateTime column
- type: PRIMITIVE
primitive:
id: 5bef5738-1638-48d6-9935-72445f0eecdc
version: 0.1.0
python_path: d3m.primitives.operator.dataset_map.DataFrameCommon
name: Map DataFrame resources to new resources using provided primitive
arguments:
inputs:
type: CONTAINER
data: inputs.1
outputs:
- id: produce
hyperparams:
primitive:
type: PRIMITIVE
data: 0
# Step 2. K-fold cross-validation timeseries dataset splits
- type: PRIMITIVE
primitive:
id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b
version: 0.1.0
python_path: d3m.primitives.evaluation.kfold_time_series_split.Common
name: K-fold cross-validation timeseries dataset splits
arguments:
inputs:
type: CONTAINER
data: inputs.0
dataset:
type: CONTAINER
data: steps.1.produce
outputs:
- id: produce
- id: produce_score_data
# Step 3. We redact privileged attributes for both score and test splits.
- type: PRIMITIVE
primitive:
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6
version: 0.2.0
python_path: d3m.primitives.evaluation.redact_columns.Common
name: Redact columns for evaluation
arguments:
inputs:
type: CONTAINER
data: steps.2.produce_score_data
outputs:
- id: produce
hyperparams:
semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/PrivilegedData
add_semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData
- https://metadata.datadrivendiscovery.org/types/MissingData
# Step 4. We further redact targets in test split.
- type: PRIMITIVE
primitive:
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6
version: 0.2.0
python_path: d3m.primitives.evaluation.redact_columns.Common
name: Redact columns for evaluation
arguments:
inputs:
type: CONTAINER
data: steps.3.produce
outputs:
- id: produce
hyperparams:
semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/TrueTarget
add_semantic_types:
type: VALUE
data:
- https://metadata.datadrivendiscovery.org/types/RedactedTarget
- https://metadata.datadrivendiscovery.org/types/MissingData

+ 0
- 247
common-primitives/pipelines/regression.xgboost_gbtree.DataFrameCommon/0f636602-6299-411b-9873-4b974cd393ba.json

@@ -1,247 +0,0 @@

{
"context": "TESTING",
"created": "2019-02-12T01:35:59.402796Z",
"id": "0f636602-6299-411b-9873-4b974cd393ba",
"inputs": [
{
"name": "inputs"
}
],
"outputs": [
{
"data": "steps.7.produce",
"name": "output predictions"
}
],
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
"steps": [
{
"arguments": {
"inputs": {
"data": "inputs.0",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65",
"name": "Extract a DataFrame from a Dataset",
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"parse_semantic_types": {
"data": [
"http://schema.org/Boolean",
"http://schema.org/Integer",
"http://schema.org/Float",
"https://metadata.datadrivendiscovery.org/types/FloatVector",
"http://schema.org/DateTime"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7",
"name": "Parses strings into their types",
"python_path": "d3m.primitives.data_transformation.column_parser.Common",
"version": "0.6.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/CategoricalData"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"exclude_columns": {
"data": [
0
],
"type": "VALUE"
},
"semantic_types": {
"data": [
"http://schema.org/Integer",
"http://schema.org/Float"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.0.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"semantic_types": {
"data": [
"https://metadata.datadrivendiscovery.org/types/TrueTarget"
],
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1",
"name": "Extracts columns by semantic type",
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.3.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
},
"use_semantic_types": {
"data": true,
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde",
"name": "sklearn.impute.SimpleImputer",
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn",
"version": "2019.6.7"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.5.produce",
"type": "CONTAINER"
},
"outputs": {
"data": "steps.4.produce",
"type": "CONTAINER"
}
},
"hyperparams": {
"return_result": {
"data": "replace",
"type": "VALUE"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "cdbb80e4-e9de-4caa-a710-16b5d727b959",
"name": "XGBoost GBTree regressor",
"python_path": "d3m.primitives.regression.xgboost_gbtree.Common",
"version": "0.1.0"
},
"type": "PRIMITIVE"
},
{
"arguments": {
"inputs": {
"data": "steps.6.produce",
"type": "CONTAINER"
},
"reference": {
"data": "steps.1.produce",
"type": "CONTAINER"
}
},
"outputs": [
{
"id": "produce"
}
],
"primitive": {
"id": "8d38b340-f83f-4877-baaa-162f8e551736",
"name": "Construct pipeline predictions output",
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common",
"version": "0.3.0"
},
"type": "PRIMITIVE"
}
]
}

+ 0
- 1
common-primitives/pipelines/schema_discovery.profiler.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json

@@ -1 +0,0 @@
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json

+ 0
- 1
common-primitives/pipelines/schema_discovery.profiler.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json

@@ -1 +0,0 @@
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json

+ 0
- 1
common-primitives/pipelines/schema_discovery.profiler.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

@@ -1 +0,0 @@
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json

+ 0
- 44
common-primitives/run_pipelines.sh

@@ -1,44 +0,0 @@
#!/bin/bash

mkdir -p results

overall_result="0"

while IFS= read -r pipeline_run_file; do
pipeline_run_name="$(dirname "$pipeline_run_file")/$(basename -s .yml.gz "$(basename -s .yaml.gz "$pipeline_run_file")")"
primitive_name="$(basename "$(dirname "$pipeline_run_file")")"

if [[ -L "$pipeline_run_file" ]]; then
echo ">>> Skipping '$pipeline_run_file'."
continue
else
mkdir -p "results/$pipeline_run_name"
fi

pipelines_path="pipelines/$primitive_name"

if [[ ! -d "$pipelines_path" ]]; then
echo ">>> ERROR: Could not find pipelines for '$pipeline_run_file'."
overall_result="1"
continue
fi

echo ">>> Running '$pipeline_run_file'."
python3 -m d3m --pipelines-path "$pipelines_path" \
runtime \
--datasets /data/datasets --volumes /data/static_files \
fit-score --input-run "$pipeline_run_file" \
--output "results/$pipeline_run_name/predictions.csv" \
--scores "results/$pipeline_run_name/scores.csv" \
--output-run "results/$pipeline_run_name/pipeline_runs.yaml"
result="$?"

if [[ "$result" -eq 0 ]]; then
echo ">>> SUCCESS ($pipeline_run_file)"
else
echo ">>> ERROR ($pipeline_run_file)"
overall_result="1"
fi
done < <(find pipeline_runs -name '*.yml.gz' -or -name '*.yaml.gz')

exit "$overall_result"

+ 0
- 11
common-primitives/run_tests.py

@@ -1,11 +0,0 @@
#!/usr/bin/env python3

import sys
import unittest

runner = unittest.TextTestRunner(verbosity=1)

tests = unittest.TestLoader().discover('tests')

if not runner.run(tests).wasSuccessful():
sys.exit(1)

+ 0
- 28
common-primitives/setup.cfg

@@ -1,28 +0,0 @@
[pycodestyle]
max-line-length = 200

[metadata]
description-file = README.md

[mypy]
warn_redundant_casts = True
# TODO: Enable back once false positives are fixed.
# See: https://github.com/python/mypy/issues/4412
#warn_unused_ignores = True
warn_unused_configs = True
disallow_untyped_defs = True

# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300
[mypy-d3m.container.list]
ignore_errors = True

# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300
[mypy-d3m.metadata.hyperparams]
ignore_errors = True

# TODO: Remove once this is fixed: https://github.com/python/mypy/pull/4384#issuecomment-354033177
[mypy-d3m.primitive_interfaces.distance]
ignore_errors = True

[mypy-common_primitives.slacker.*]
ignore_errors = True

+ 0
- 65
common-primitives/setup.py

@@ -1,65 +0,0 @@
import os
import sys
from setuptools import setup, find_packages

PACKAGE_NAME = 'common_primitives'
MINIMUM_PYTHON_VERSION = 3, 6


def check_python_version():
"""Exit when the Python version is too low."""
if sys.version_info < MINIMUM_PYTHON_VERSION:
sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION))


def read_package_variable(key):
"""Read the value of a variable from the package without importing."""
module_path = os.path.join(PACKAGE_NAME, '__init__.py')
with open(module_path) as module:
for line in module:
parts = line.strip().split(' ')
if parts and parts[0] == key:
return parts[-1].strip("'")
raise KeyError("'{0}' not found in '{1}'".format(key, module_path))


def read_readme():
with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file:
return file.read()


def read_entry_points():
with open('entry_points.ini') as entry_points:
return entry_points.read()


check_python_version()
version = read_package_variable('__version__')

setup(
name=PACKAGE_NAME,
version=version,
description='D3M common primitives',
author=read_package_variable('__author__'),
packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
data_files=[('./', ['./entry_points.ini'])],
install_requires=[
'd3m',
'pandas',
'scikit-learn',
'numpy',
'lightgbm>=2.2.2,<=2.3.0',
'opencv-python-headless<=4.1.1.26,>=4.1',
'imageio>=2.3.0,<=2.6.0',
'pillow==6.2.1',
'xgboost>=0.81,<=0.90',
],
entry_points=read_entry_points(),
url='https://gitlab.com/datadrivendiscovery/common-primitives',
long_description=read_readme(),
long_description_content_type='text/markdown',
license='Apache-2.0',
classifiers=[
'License :: OSI Approved :: Apache Software License',
],
)
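
# Note: read_package_variable above splits each line of common_primitives/__init__.py on
# single spaces and returns the last token stripped of quotes, so it expects single-token
# assignments of exactly the form sketched below; a value containing spaces would be
# silently truncated to its final word. The values here are hypothetical, for illustration
# only:

# common_primitives/__init__.py (expected format)
__version__ = '0.0.0'
__author__ = 'common-primitives_developers'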

+ 0
- 2
common-primitives/sklearn-wrap/.gitignore

@@ -1,2 +0,0 @@
.pyc
__pycache__

+ 0
- 31
common-primitives/sklearn-wrap/requirements.txt

@@ -1,31 +0,0 @@
scikit-learn==0.22.0
pytypes==1.0b5
frozendict==1.2
numpy>=1.15.4,<=1.18.1
jsonschema==2.6.0
requests>=2.19.1,<=2.22.0
strict-rfc3339==0.7
rfc3987==1.3.8
webcolors>=1.8.1,<=1.10
dateparser>=0.7.0,<=0.7.2
python-dateutil==2.8.1
pandas==0.25
typing-inspect==0.5.0
GitPython>=2.1.11,<=3.0.5
jsonpath-ng==1.4.3
custom-inherit>=2.2.0,<=2.2.2
PyYAML>=5.1,<=5.3
pycurl>=7.43.0.2,<=7.43.0.3
pyarrow==0.15.1
gputil>=1.3.0,<=1.4.0
pyrsistent>=0.14.11,<=0.15.7
scipy>=1.2.1,<=1.4.1
openml==0.10.1
lightgbm>=2.2.2,<=2.3.0
opencv-python-headless<=4.1.1.26,>=4.1
imageio>=2.3.0,<=2.6.0
pillow==6.2.1
xgboost>=0.81,<=0.90
Jinja2==2.9.4
simplejson==3.12.0
gitdb2==2.0.6

+ 0
- 106
common-primitives/sklearn-wrap/setup.py

@@ -1,106 +0,0 @@
import os
from setuptools import setup, find_packages

PACKAGE_NAME = 'sklearn_wrap'


def read_package_variable(key):
"""Read the value of a variable from the package without importing."""
module_path = os.path.join(PACKAGE_NAME, '__init__.py')
with open(module_path) as module:
for line in module:
parts = line.strip().split(' ')
if parts and parts[0] == key:
return parts[-1].strip("'")
assert False, "'{0}' not found in '{1}'".format(key, module_path)


setup(
name=PACKAGE_NAME,
version=read_package_variable('__version__'),
description='Primitives created using the Sklearn auto wrapper',
author=read_package_variable('__author__'),
packages=find_packages(exclude=['contrib', 'docs', 'tests*']),
install_requires=[
'd3m',
'Jinja2==2.9.4',
'simplejson==3.12.0',
'scikit-learn==0.22.0',
],
url='https://gitlab.datadrivendiscovery.org/jpl/sklearn-wrapping',
entry_points={
'd3m.primitives': [
'data_cleaning.string_imputer.SKlearn = sklearn_wrap.SKStringImputer:SKStringImputer',
'classification.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingClassifier:SKGradientBoostingClassifier',
'classification.quadratic_discriminant_analysis.SKlearn = sklearn_wrap.SKQuadraticDiscriminantAnalysis:SKQuadraticDiscriminantAnalysis',
'classification.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeClassifier:SKDecisionTreeClassifier',
'classification.sgd.SKlearn = sklearn_wrap.SKSGDClassifier:SKSGDClassifier',
'classification.nearest_centroid.SKlearn = sklearn_wrap.SKNearestCentroid:SKNearestCentroid',
'classification.mlp.SKlearn = sklearn_wrap.SKMLPClassifier:SKMLPClassifier',
'classification.bagging.SKlearn = sklearn_wrap.SKBaggingClassifier:SKBaggingClassifier',
'classification.linear_svc.SKlearn = sklearn_wrap.SKLinearSVC:SKLinearSVC',
'classification.linear_discriminant_analysis.SKlearn = sklearn_wrap.SKLinearDiscriminantAnalysis:SKLinearDiscriminantAnalysis',
'classification.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveClassifier:SKPassiveAggressiveClassifier',
'classification.gaussian_naive_bayes.SKlearn = sklearn_wrap.SKGaussianNB:SKGaussianNB',
'classification.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostClassifier:SKAdaBoostClassifier',
'classification.random_forest.SKlearn = sklearn_wrap.SKRandomForestClassifier:SKRandomForestClassifier',
'classification.svc.SKlearn = sklearn_wrap.SKSVC:SKSVC',
'classification.multinomial_naive_bayes.SKlearn = sklearn_wrap.SKMultinomialNB:SKMultinomialNB',
'classification.dummy.SKlearn = sklearn_wrap.SKDummyClassifier:SKDummyClassifier',
'classification.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesClassifier:SKExtraTreesClassifier',
'classification.logistic_regression.SKlearn = sklearn_wrap.SKLogisticRegression:SKLogisticRegression',
'classification.bernoulli_naive_bayes.SKlearn = sklearn_wrap.SKBernoulliNB:SKBernoulliNB',
'classification.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsClassifier:SKKNeighborsClassifier',
'regression.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeRegressor:SKDecisionTreeRegressor',
'regression.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostRegressor:SKAdaBoostRegressor',
'regression.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsRegressor:SKKNeighborsRegressor',
'regression.linear.SKlearn = sklearn_wrap.SKLinearRegression:SKLinearRegression',
'regression.bagging.SKlearn = sklearn_wrap.SKBaggingRegressor:SKBaggingRegressor',
'regression.lasso_cv.SKlearn = sklearn_wrap.SKLassoCV:SKLassoCV',
'regression.elastic_net.SKlearn = sklearn_wrap.SKElasticNet:SKElasticNet',
'regression.ard.SKlearn = sklearn_wrap.SKARDRegression:SKARDRegression',
'regression.svr.SKlearn = sklearn_wrap.SKSVR:SKSVR',
'regression.ridge.SKlearn = sklearn_wrap.SKRidge:SKRidge',
'regression.gaussian_process.SKlearn = sklearn_wrap.SKGaussianProcessRegressor:SKGaussianProcessRegressor',
'regression.mlp.SKlearn = sklearn_wrap.SKMLPRegressor:SKMLPRegressor',
'regression.dummy.SKlearn = sklearn_wrap.SKDummyRegressor:SKDummyRegressor',
'regression.sgd.SKlearn = sklearn_wrap.SKSGDRegressor:SKSGDRegressor',
'regression.lasso.SKlearn = sklearn_wrap.SKLasso:SKLasso',
'regression.lars.SKlearn = sklearn_wrap.SKLars:SKLars',
'regression.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesRegressor:SKExtraTreesRegressor',
'regression.linear_svr.SKlearn = sklearn_wrap.SKLinearSVR:SKLinearSVR',
'regression.random_forest.SKlearn = sklearn_wrap.SKRandomForestRegressor:SKRandomForestRegressor',
'regression.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingRegressor:SKGradientBoostingRegressor',
'regression.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveRegressor:SKPassiveAggressiveRegressor',
'regression.kernel_ridge.SKlearn = sklearn_wrap.SKKernelRidge:SKKernelRidge',
'data_preprocessing.max_abs_scaler.SKlearn = sklearn_wrap.SKMaxAbsScaler:SKMaxAbsScaler',
'data_preprocessing.normalizer.SKlearn = sklearn_wrap.SKNormalizer:SKNormalizer',
'data_preprocessing.robust_scaler.SKlearn = sklearn_wrap.SKRobustScaler:SKRobustScaler',
'data_preprocessing.tfidf_vectorizer.SKlearn = sklearn_wrap.SKTfidfVectorizer:SKTfidfVectorizer',
'data_transformation.one_hot_encoder.SKlearn = sklearn_wrap.SKOneHotEncoder:SKOneHotEncoder',
'data_preprocessing.truncated_svd.SKlearn = sklearn_wrap.SKTruncatedSVD:SKTruncatedSVD',
'feature_selection.select_percentile.SKlearn = sklearn_wrap.SKSelectPercentile:SKSelectPercentile',
'feature_extraction.pca.SKlearn = sklearn_wrap.SKPCA:SKPCA',
'data_preprocessing.count_vectorizer.SKlearn = sklearn_wrap.SKCountVectorizer:SKCountVectorizer',
'data_transformation.ordinal_encoder.SKlearn = sklearn_wrap.SKOrdinalEncoder:SKOrdinalEncoder',
'data_preprocessing.binarizer.SKlearn = sklearn_wrap.SKBinarizer:SKBinarizer',
'data_cleaning.missing_indicator.SKlearn = sklearn_wrap.SKMissingIndicator:SKMissingIndicator',
'feature_selection.select_fwe.SKlearn = sklearn_wrap.SKSelectFwe:SKSelectFwe',
'data_preprocessing.rbf_sampler.SKlearn = sklearn_wrap.SKRBFSampler:SKRBFSampler',
'data_preprocessing.min_max_scaler.SKlearn = sklearn_wrap.SKMinMaxScaler:SKMinMaxScaler',
'data_preprocessing.random_trees_embedding.SKlearn = sklearn_wrap.SKRandomTreesEmbedding:SKRandomTreesEmbedding',
'data_transformation.gaussian_random_projection.SKlearn = sklearn_wrap.SKGaussianRandomProjection:SKGaussianRandomProjection',
'feature_extraction.kernel_pca.SKlearn = sklearn_wrap.SKKernelPCA:SKKernelPCA',
'data_preprocessing.polynomial_features.SKlearn = sklearn_wrap.SKPolynomialFeatures:SKPolynomialFeatures',
'data_preprocessing.feature_agglomeration.SKlearn = sklearn_wrap.SKFeatureAgglomeration:SKFeatureAgglomeration',
'data_cleaning.imputer.SKlearn = sklearn_wrap.SKImputer:SKImputer',
'data_preprocessing.standard_scaler.SKlearn = sklearn_wrap.SKStandardScaler:SKStandardScaler',
'data_transformation.fast_ica.SKlearn = sklearn_wrap.SKFastICA:SKFastICA',
'data_preprocessing.quantile_transformer.SKlearn = sklearn_wrap.SKQuantileTransformer:SKQuantileTransformer',
'data_transformation.sparse_random_projection.SKlearn = sklearn_wrap.SKSparseRandomProjection:SKSparseRandomProjection',
'data_preprocessing.nystroem.SKlearn = sklearn_wrap.SKNystroem:SKNystroem',
'feature_selection.variance_threshold.SKlearn = sklearn_wrap.SKVarianceThreshold:SKVarianceThreshold',
'feature_selection.generic_univariate_select.SKlearn = sklearn_wrap.SKGenericUnivariateSelect:SKGenericUnivariateSelect',
],
},
)

+ 0
- 470
common-primitives/sklearn-wrap/sklearn_wrap/SKARDRegression.py

@@ -1,470 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.linear_model.bayes import ARDRegression


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
coef_: Optional[ndarray]
alpha_: Optional[float]
lambda_: Optional[ndarray]
sigma_: Optional[ndarray]
scores_: Optional[Sequence[Any]]
intercept_: Optional[float]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_iter = hyperparams.Bounded[int](
default=300,
lower=0,
upper=None,
description='Maximum number of iterations. Default is 300',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
tol = hyperparams.Bounded[float](
default=0.001,
lower=0,
upper=None,
description='Stop the algorithm if w has converged. Default is 1.e-3.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
alpha_1 = hyperparams.Hyperparameter[float](
default=1e-06,
description='Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
alpha_2 = hyperparams.Hyperparameter[float](
default=1e-06,
description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
lambda_1 = hyperparams.Hyperparameter[float](
default=1e-06,
description='Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
lambda_2 = hyperparams.Hyperparameter[float](
default=1e-06,
description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
threshold_lambda = hyperparams.Hyperparameter[float](
default=10000.0,
description='threshold for removing (pruning) weights with high precision from the computation. Default is 1.e+4.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
fit_intercept = hyperparams.UniformBool(
default=True,
description='whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). Default is True.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
normalize = hyperparams.UniformBool(
default=False,
description='If True, the regressors X will be normalized before regression. This parameter is ignored when `fit_intercept` is set to False. When the regressors are normalized, note that this makes the hyperparameters learnt more robust and almost independent of the number of samples. The same property is not valid for standardized data. However, if you wish to standardize, please use `preprocessing.StandardScaler` before calling `fit` on an estimator with `normalize=False`. copy_X : boolean, optional, default True. If True, X will be copied; else, it may be overwritten.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKARDRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn ARDRegression
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.BAYESIAN_LINEAR_REGRESSION, ],
"name": "sklearn.linear_model.bayes.ARDRegression",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.ard.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html']},
"version": "2019.11.13",
"id": "966dd2c4-d439-3ad6-b49f-17706595606c",
"hyperparams_to_tune": ['n_iter'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_copy_X: bool = True,
_verbose: bool = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = ARDRegression(
n_iter=self.hyperparams['n_iter'],
tol=self.hyperparams['tol'],
alpha_1=self.hyperparams['alpha_1'],
alpha_2=self.hyperparams['alpha_2'],
lambda_1=self.hyperparams['lambda_1'],
lambda_2=self.hyperparams['lambda_2'],
threshold_lambda=self.hyperparams['threshold_lambda'],
fit_intercept=self.hyperparams['fit_intercept'],
normalize=self.hyperparams['normalize'],
copy_X=_copy_X,
verbose=_verbose
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
coef_=None,
alpha_=None,
lambda_=None,
sigma_=None,
scores_=None,
intercept_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
coef_=getattr(self._clf, 'coef_', None),
alpha_=getattr(self._clf, 'alpha_', None),
lambda_=getattr(self._clf, 'lambda_', None),
sigma_=getattr(self._clf, 'sigma_', None),
scores_=getattr(self._clf, 'scores_', None),
intercept_=getattr(self._clf, 'intercept_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.coef_ = params['coef_']
self._clf.alpha_ = params['alpha_']
self._clf.lambda_ = params['lambda_']
self._clf.sigma_ = params['sigma_']
self._clf.scores_ = params['scores_']
self._clf.intercept_ = params['intercept_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if any(params[name] is not None for name in ('coef_', 'alpha_', 'lambda_', 'sigma_', 'scores_', 'intercept_')):
self._fitted = True




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare the column metadata for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKARDRegression.__doc__ = ARDRegression.__doc__
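
The fit/produce/get_params methods above follow the standard d3m supervised-learner lifecycle. As a rough illustration only (not part of the deleted file), assuming a d3m DataFrame `attributes` whose columns carry the Attribute semantic type, a DataFrame `targets` with a TrueTarget column, and that the module is importable as sklearn_wrap.SKARDRegression (all three are assumptions for this sketch), the primitive would typically be driven like this:

    from sklearn_wrap.SKARDRegression import SKARDRegression, Hyperparams

    hp = Hyperparams.defaults().replace({'use_semantic_types': True})
    primitive = SKARDRegression(hyperparams=hp)
    primitive.set_training_data(inputs=attributes, outputs=targets)
    primitive.fit()                                            # column selection + sklearn ARDRegression fit
    predictions = primitive.produce(inputs=attributes).value   # d3m DataFrame of PredictedTarget columns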

+ 0
- 498
common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostClassifier.py

@@ -1,498 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.weight_boosting import AdaBoostClassifier


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
estimators_: Optional[Sequence[sklearn.base.BaseEstimator]]
classes_: Optional[ndarray]
n_classes_: Optional[int]
estimator_weights_: Optional[ndarray]
estimator_errors_: Optional[ndarray]
base_estimator_: Optional[object]
estimator_params: Optional[tuple]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
base_estimator = hyperparams.Constant(
default=None,
description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper `classes_` and `n_classes_` attributes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_estimators = hyperparams.Bounded[int](
lower=1,
upper=None,
default=50,
description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
learning_rate = hyperparams.Uniform(
lower=0.01,
upper=2,
default=0.1,
description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
algorithm = hyperparams.Enumeration[str](
values=['SAMME.R', 'SAMME'],
default='SAMME.R',
description='If \'SAMME.R\' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If \'SAMME\' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKAdaBoostClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn AdaBoostClassifier
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ],
"name": "sklearn.ensemble.weight_boosting.AdaBoostClassifier",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.ada_boost.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html']},
"version": "2019.11.13",
"id": "4210a6a6-14ab-4490-a7dc-460763e70e55",
"hyperparams_to_tune": ['learning_rate', 'n_estimators'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = AdaBoostClassifier(
base_estimator=self.hyperparams['base_estimator'],
n_estimators=self.hyperparams['n_estimators'],
learning_rate=self.hyperparams['learning_rate'],
algorithm=self.hyperparams['algorithm'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# Some estimators (e.g. GaussianProcessRegressor) can predict without being fitted, so also check the primitive's own fitted flag.
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
estimators_=None,
classes_=None,
n_classes_=None,
estimator_weights_=None,
estimator_errors_=None,
base_estimator_=None,
estimator_params=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
estimators_=getattr(self._clf, 'estimators_', None),
classes_=getattr(self._clf, 'classes_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
estimator_weights_=getattr(self._clf, 'estimator_weights_', None),
estimator_errors_=getattr(self._clf, 'estimator_errors_', None),
base_estimator_=getattr(self._clf, 'base_estimator_', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.estimators_ = params['estimators_']
self._clf.classes_ = params['classes_']
self._clf.n_classes_ = params['n_classes_']
self._clf.estimator_weights_ = params['estimator_weights_']
self._clf.estimator_errors_ = params['estimator_errors_']
self._clf.base_estimator_ = params['base_estimator_']
self._clf.estimator_params = params['estimator_params']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if any(params[name] is not None for name in ('estimators_', 'classes_', 'n_classes_', 'estimator_weights_', 'estimator_errors_', 'base_estimator_', 'estimator_params')):
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)
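# Worked illustration (editorial note, not in the original file): with
# self._clf.classes_ == ['cat', 'dog'] and an outputs column ['dog', 'cat'],
# classes_map in the loop above is {'cat': 0, 'dog': 1}, mapped_outputs_column
# becomes [1, 0], and log_proba[k][numpy.arange(2), [1, 0]] selects, for each
# row, the log-probability assigned to that row's true class.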


def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare the column metadata for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKAdaBoostClassifier.__doc__ = AdaBoostClassifier.__doc__
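
Because get_params/set_params above round-trip the fitted sklearn attributes through the Params object, a fitted instance can be rehydrated without retraining. A minimal sketch, assuming `fitted` is an already-fitted SKAdaBoostClassifier and `test_inputs` is a d3m DataFrame (both placeholder names for this example):

    params = fitted.get_params()                       # plain mapping of the fitted sklearn attributes

    fresh = SKAdaBoostClassifier(hyperparams=fitted.hyperparams)
    fresh.set_params(params=params)                    # restores estimators_, classes_, ... and marks the primitive fitted
    predictions = fresh.produce(inputs=test_inputs).value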

+ 0
- 437
common-primitives/sklearn-wrap/sklearn_wrap/SKAdaBoostRegressor.py

@@ -1,437 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.weight_boosting import AdaBoostRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]]
estimator_weights_: Optional[ndarray]
estimator_errors_: Optional[ndarray]
estimator_params: Optional[tuple]
base_estimator_: Optional[object]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
base_estimator = hyperparams.Constant(
default=None,
description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_estimators = hyperparams.Bounded[int](
lower=1,
upper=None,
default=50,
description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
learning_rate = hyperparams.Uniform(
lower=0.01,
upper=2,
default=0.1,
description='Learning rate shrinks the contribution of each regressor by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
loss = hyperparams.Enumeration[str](
values=['linear', 'square', 'exponential'],
default='linear',
description='The loss function to use when updating the weights after each boosting iteration.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKAdaBoostRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn AdaBoostRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ],
"name": "sklearn.ensemble.weight_boosting.AdaBoostRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.ada_boost.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html']},
"version": "2019.11.13",
"id": "6cab1537-02e1-4dc4-9ebb-53fa2cbabedd",
"hyperparams_to_tune": ['learning_rate', 'n_estimators'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = AdaBoostRegressor(
base_estimator=self.hyperparams['base_estimator'],
n_estimators=self.hyperparams['n_estimators'],
learning_rate=self.hyperparams['learning_rate'],
loss=self.hyperparams['loss'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# Some estimators (e.g. GaussianProcessRegressor) can predict without being fitted, so also check the primitive's own fitted flag.
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
estimators_=None,
estimator_weights_=None,
estimator_errors_=None,
estimator_params=None,
base_estimator_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
estimators_=getattr(self._clf, 'estimators_', None),
estimator_weights_=getattr(self._clf, 'estimator_weights_', None),
estimator_errors_=getattr(self._clf, 'estimator_errors_', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
base_estimator_=getattr(self._clf, 'base_estimator_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.estimators_ = params['estimators_']
self._clf.estimator_weights_ = params['estimator_weights_']
self._clf.estimator_errors_ = params['estimator_errors_']
self._clf.estimator_params = params['estimator_params']
self._clf.base_estimator_ = params['base_estimator_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if any(params[name] is not None for name in ('estimators_', 'estimator_weights_', 'estimator_errors_', 'estimator_params', 'base_estimator_')):
self._fitted = True




def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare the column metadata for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKAdaBoostRegressor.__doc__ = AdaBoostRegressor.__doc__
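
When use_semantic_types is enabled, the _get_columns_to_fit/_get_targets helpers above select columns purely from metadata: numeric columns carrying the Attribute semantic type become training inputs and TrueTarget columns become training targets. A hypothetical illustration of tagging a column so that _can_produce_column accepts it (the dataframe name `df` and column index 0 are placeholders, not part of the original file):

    df.metadata = df.metadata.update_column(0, {
        'structural_type': float,
        'semantic_types': ('https://metadata.datadrivendiscovery.org/types/Attribute',),
    })
    # _can_produce_column(df.metadata, 0, hyperparams) would now return True for column 0.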

+ 0
- 589
common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingClassifier.py

@@ -1,589 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.bagging import BaggingClassifier


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
base_estimator_: Optional[object]
estimators_: Optional[List[sklearn.tree.DecisionTreeClassifier]]
estimators_features_: Optional[List[ndarray]]
classes_: Optional[ndarray]
n_classes_: Optional[int]
oob_score_: Optional[float]
oob_decision_function_: Optional[List[ndarray]]
n_features_: Optional[int]
_max_features: Optional[int]
_max_samples: Optional[int]
_n_samples: Optional[int]
_seeds: Optional[ndarray]
estimator_params: Optional[tuple]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_estimators = hyperparams.Bounded[int](
default=10,
lower=1,
upper=None,
description='The number of base estimators in the ensemble.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_samples = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=1.0,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='percent',
description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=1.0,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='percent',
description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
bootstrap = hyperparams.Enumeration[str](
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'],
default='bootstrap',
description='Whether bootstrap samples are used when building trees.'
' And whether to use out-of-bag samples to estimate the generalization accuracy.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
bootstrap_features = hyperparams.UniformBool(
default=False,
description='Whether features are drawn with replacement.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
warm_start = hyperparams.UniformBool(
default=False,
description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble; otherwise, fit a whole new ensemble. (The *warm_start* constructor parameter was added in sklearn 0.17.)',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_jobs = hyperparams.Union(
configuration=OrderedDict({
'limit': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'all_cores': hyperparams.Constant(
default=-1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='limit',
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
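
# Editorial illustration (not part of the original file): the Union hyperparameters
# above accept whichever branch matches the supplied value -- an int selects the
# 'absolute' configuration of max_samples/max_features, a float in [0, 1] the
# 'percent' one -- and the 'bootstrap' enumeration folds sklearn's separate
# bootstrap/oob_score flags into a single choice (see the constructor below).
# A hypothetical override could look like:
#
#     hp = Hyperparams.defaults().replace({
#         'n_estimators': 25,
#         'max_samples': 0.8,
#         'bootstrap': 'bootstrap_with_oob_score',
#     })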

class SKBaggingClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn BaggingClassifier
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ],
"name": "sklearn.ensemble.bagging.BaggingClassifier",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.bagging.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html']},
"version": "2019.11.13",
"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1",
"hyperparams_to_tune": ['n_estimators', 'max_samples', 'max_features'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_verbose: int = 0) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = BaggingClassifier(
n_estimators=self.hyperparams['n_estimators'],
max_samples=self.hyperparams['max_samples'],
max_features=self.hyperparams['max_features'],
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'],
bootstrap_features=self.hyperparams['bootstrap_features'],
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'],
warm_start=self.hyperparams['warm_start'],
n_jobs=self.hyperparams['n_jobs'],
random_state=self.random_seed,
verbose=_verbose
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# Some estimators (e.g. GaussianProcessRegressor) can predict without being fitted, so also check the primitive's own fitted flag.
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
base_estimator_=None,
estimators_=None,
estimators_features_=None,
classes_=None,
n_classes_=None,
oob_score_=None,
oob_decision_function_=None,
n_features_=None,
_max_features=None,
_max_samples=None,
_n_samples=None,
_seeds=None,
estimator_params=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
base_estimator_=getattr(self._clf, 'base_estimator_', None),
estimators_=getattr(self._clf, 'estimators_', None),
estimators_features_=getattr(self._clf, 'estimators_features_', None),
classes_=getattr(self._clf, 'classes_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
oob_score_=getattr(self._clf, 'oob_score_', None),
oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None),
n_features_=getattr(self._clf, 'n_features_', None),
_max_features=getattr(self._clf, '_max_features', None),
_max_samples=getattr(self._clf, '_max_samples', None),
_n_samples=getattr(self._clf, '_n_samples', None),
_seeds=getattr(self._clf, '_seeds', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.base_estimator_ = params['base_estimator_']
self._clf.estimators_ = params['estimators_']
self._clf.estimators_features_ = params['estimators_features_']
self._clf.classes_ = params['classes_']
self._clf.n_classes_ = params['n_classes_']
self._clf.oob_score_ = params['oob_score_']
self._clf.oob_decision_function_ = params['oob_decision_function_']
self._clf.n_features_ = params['n_features_']
self._clf._max_features = params['_max_features']
self._clf._max_samples = params['_max_samples']
self._clf._n_samples = params['_n_samples']
self._clf._seeds = params['_seeds']
self._clf.estimator_params = params['estimator_params']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['base_estimator_'] is not None:
self._fitted = True
if params['estimators_'] is not None:
self._fitted = True
if params['estimators_features_'] is not None:
self._fitted = True
if params['classes_'] is not None:
self._fitted = True
if params['n_classes_'] is not None:
self._fitted = True
if params['oob_score_'] is not None:
self._fitted = True
if params['oob_decision_function_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['_max_features'] is not None:
self._fitted = True
if params['_max_samples'] is not None:
self._fitted = True
if params['_n_samples'] is not None:
self._fitted = True
if params['_seeds'] is not None:
self._fitted = True
if params['estimator_params'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])
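# Example: if classes[k] is ['no', 'yes'], the map sends 'no' -> 0 and 'yes' -> 1, so the
# fancy indexing above picks, for each row, the log-probability assigned to its true class.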

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
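# A column is usable only if its structural type is numeric and it is tagged as an Attribute.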
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKBaggingClassifier.__doc__ = BaggingClassifier.__doc__

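For orientation, a minimal usage sketch of the supervised-learner interface implemented above; train_inputs, train_outputs and test_inputs are placeholder d3m DataFrames, not names from this repository:

# Minimal sketch (not part of the original file): fit the wrapper and produce predictions.
hp = Hyperparams.defaults().replace({'n_estimators': 20})   # override tuning hyperparams as needed
primitive = SKBaggingClassifier(hyperparams=hp)
primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
primitive.fit()
predictions = primitive.produce(inputs=test_inputs).value   # CallResult.value holds the predictions DataFrame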
+ 0 - 533 common-primitives/sklearn-wrap/sklearn_wrap/SKBaggingRegressor.py

@@ -1,533 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.bagging import BaggingRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]]
estimators_features_: Optional[List[ndarray]]
oob_score_: Optional[float]
oob_prediction_: Optional[ndarray]
base_estimator_: Optional[object]
n_features_: Optional[int]
_max_features: Optional[int]
_max_samples: Optional[int]
_n_samples: Optional[int]
_seeds: Optional[ndarray]
estimator_params: Optional[tuple]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
base_estimator = hyperparams.Constant(
default=None,
description='The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_estimators = hyperparams.Bounded[int](
default=10,
lower=1,
upper=None,
description='The number of base estimators in the ensemble.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_samples = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=1.0,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='percent',
description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
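# For example, with 100 training rows the 'percent' default of 1.0 bags all 100 rows per estimator,
# while an 'absolute' value of 10 would draw exactly 10 rows.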
max_features = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=1.0,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='percent',
description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
bootstrap = hyperparams.Enumeration[str](
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'],
default='bootstrap',
description='Whether bootstrap samples are used when building trees.'
' And whether to use out-of-bag samples to estimate the generalization accuracy.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
bootstrap_features = hyperparams.UniformBool(
default=False,
description='Whether features are drawn with replacement.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
warm_start = hyperparams.UniformBool(
default=False,
description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. See :term:`the Glossary <warm_start>`.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_jobs = hyperparams.Union(
configuration=OrderedDict({
'limit': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'all_cores': hyperparams.Constant(
default=-1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='limit',
description='The number of jobs to run in parallel for both `fit` and `predict`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for more details.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKBaggingRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn BaggingRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ],
"name": "sklearn.ensemble.bagging.BaggingRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.bagging.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html']},
"version": "2019.11.13",
"id": "0dbc4b6d-aa57-4f11-ab18-36125880151b",
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_verbose: int = 0) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = BaggingRegressor(
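# base_estimator=None lets sklearn fall back to its default DecisionTreeRegressor.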
base_estimator=self.hyperparams['base_estimator'],
n_estimators=self.hyperparams['n_estimators'],
max_samples=self.hyperparams['max_samples'],
max_features=self.hyperparams['max_features'],
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'],
bootstrap_features=self.hyperparams['bootstrap_features'],
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'],
warm_start=self.hyperparams['warm_start'],
n_jobs=self.hyperparams['n_jobs'],
random_state=self.random_seed,
verbose=_verbose
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
estimators_=None,
estimators_features_=None,
oob_score_=None,
oob_prediction_=None,
base_estimator_=None,
n_features_=None,
_max_features=None,
_max_samples=None,
_n_samples=None,
_seeds=None,
estimator_params=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
estimators_=getattr(self._clf, 'estimators_', None),
estimators_features_=getattr(self._clf, 'estimators_features_', None),
oob_score_=getattr(self._clf, 'oob_score_', None),
oob_prediction_=getattr(self._clf, 'oob_prediction_', None),
base_estimator_=getattr(self._clf, 'base_estimator_', None),
n_features_=getattr(self._clf, 'n_features_', None),
_max_features=getattr(self._clf, '_max_features', None),
_max_samples=getattr(self._clf, '_max_samples', None),
_n_samples=getattr(self._clf, '_n_samples', None),
_seeds=getattr(self._clf, '_seeds', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.estimators_ = params['estimators_']
self._clf.estimators_features_ = params['estimators_features_']
self._clf.oob_score_ = params['oob_score_']
self._clf.oob_prediction_ = params['oob_prediction_']
self._clf.base_estimator_ = params['base_estimator_']
self._clf.n_features_ = params['n_features_']
self._clf._max_features = params['_max_features']
self._clf._max_samples = params['_max_samples']
self._clf._n_samples = params['_n_samples']
self._clf._seeds = params['_seeds']
self._clf.estimator_params = params['estimator_params']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['estimators_'] is not None:
self._fitted = True
if params['estimators_features_'] is not None:
self._fitted = True
if params['oob_score_'] is not None:
self._fitted = True
if params['oob_prediction_'] is not None:
self._fitted = True
if params['base_estimator_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['_max_features'] is not None:
self._fitted = True
if params['_max_samples'] is not None:
self._fitted = True
if params['_n_samples'] is not None:
self._fitted = True
if params['_seeds'] is not None:
self._fitted = True
if params['estimator_params'] is not None:
self._fitted = True




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKBaggingRegressor.__doc__ = BaggingRegressor.__doc__

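The get_params/set_params pair above is what lets a fitted wrapper be exported and restored without retraining. A minimal sketch, assuming fitted_primitive is an SKBaggingRegressor whose fit() has already succeeded and test_inputs is a placeholder d3m DataFrame:

# Minimal sketch (not part of the original file): move learned state between instances.
params = fitted_primitive.get_params()                         # snapshot of the fitted sklearn attributes
restored = SKBaggingRegressor(hyperparams=fitted_primitive.hyperparams)
restored.set_params(params=params)                             # any non-None estimator attribute marks it fitted
predictions = restored.produce(inputs=test_inputs).value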
+ 0 - 508 common-primitives/sklearn-wrap/sklearn_wrap/SKBernoulliNB.py

@@ -1,508 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.naive_bayes import BernoulliNB


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
class_log_prior_: Optional[ndarray]
feature_log_prob_: Optional[ndarray]
class_count_: Optional[ndarray]
feature_count_: Optional[ndarray]
classes_: Optional[ndarray]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
alpha = hyperparams.Bounded[float](
default=1,
lower=0,
upper=None,
description='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
binarize = hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Bounded[float](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='float',
description='Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
fit_prior = hyperparams.UniformBool(
default=True,
description='Whether to learn class prior probabilities or not. If false, a uniform prior will be used.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKBernoulliNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams],
ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn BernoulliNB
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ],
"name": "sklearn.naive_bayes.BernoulliNB",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.bernoulli_naive_bayes.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html']},
"version": "2019.11.13",
"id": "dfb1004e-02ac-3399-ba57-8a95639312cd",
"hyperparams_to_tune": ['alpha', 'binarize', 'fit_prior'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = BernoulliNB(
alpha=self.hyperparams['alpha'],
binarize=self.hyperparams['binarize'],
fit_prior=self.hyperparams['fit_prior'],
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._training_inputs is None or self._training_outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

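# partial_fit updates BernoulliNB's class and feature counts in place instead of refitting from scratch.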
self._clf.partial_fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
class_log_prior_=None,
feature_log_prob_=None,
class_count_=None,
feature_count_=None,
classes_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
class_log_prior_=getattr(self._clf, 'class_log_prior_', None),
feature_log_prob_=getattr(self._clf, 'feature_log_prob_', None),
class_count_=getattr(self._clf, 'class_count_', None),
feature_count_=getattr(self._clf, 'feature_count_', None),
classes_=getattr(self._clf, 'classes_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.class_log_prior_ = params['class_log_prior_']
self._clf.feature_log_prob_ = params['feature_log_prob_']
self._clf.class_count_ = params['class_count_']
self._clf.feature_count_ = params['feature_count_']
self._clf.classes_ = params['classes_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['class_log_prior_'] is not None:
self._fitted = True
if params['feature_log_prob_'] is not None:
self._fitted = True
if params['class_count_'] is not None:
self._fitted = True
if params['feature_count_'] is not None:
self._fitted = True
if params['classes_'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKBernoulliNB.__doc__ = BernoulliNB.__doc__

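The column-selection hyperparams defined above are common to all of these wrappers; with use_semantic_types enabled, only columns tagged as Attribute are used as features and only TrueTarget columns as targets. A minimal configuration sketch (placeholder names, not from this repository):

# Minimal sketch (not part of the original file): select columns via semantic types.
hp = Hyperparams.defaults().replace({
    'use_semantic_types': True,   # filter columns by their semantic_types metadata
    'return_result': 'new',       # return only the predicted target columns
    'add_index_columns': True,    # plus the primary index column, if present
})
primitive = SKBernoulliNB(hyperparams=hp)
# set_training_data / fit / produce then behave exactly as in the wrappers above.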
+ 0 - 330 common-primitives/sklearn-wrap/sklearn_wrap/SKBinarizer.py

@@ -1,330 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.preprocessing.data import Binarizer


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase


Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
threshold = hyperparams.Bounded[float](
default=0.0,
lower=0.0,
upper=None,
description='Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
default='https://metadata.datadrivendiscovery.org/types/Attribute',
description='Decides what semantic type to attach to generated attributes',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKBinarizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn Binarizer
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ],
"name": "sklearn.preprocessing.data.Binarizer",
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
"python_path": "d3m.primitives.data_preprocessing.binarizer.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html']},
"version": "2019.11.13",
"id": "13777068-9dc0-3c5b-b4da-99350d67ee3f",
"hyperparams_to_tune": ['threshold'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = Binarizer(
threshold=self.hyperparams['threshold'],
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
def set_training_data(self, *, inputs: Inputs) -> None:
self._inputs = inputs
self._fitted = False
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._fitted:
return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)

        if self._training_inputs is None:
            return CallResult(None)

        self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs = inputs.iloc[:, self._training_indices]
output_columns = []
if len(self._training_indices) > 0:
sk_output = self._clf.transform(sk_inputs)
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
outputs = self._wrap_predictions(inputs, sk_output)
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)
return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
self._fitted = True



@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_columns'],
exclude_columns=hyperparams['exclude_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
outputs_metadata: metadata_base.DataMetadata, hyperparams):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in input_indices:
column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)

column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

# If outputs has more columns than index, add Attribute Type to all remaining
if outputs_length > len(input_indices):
for column_index in range(len(input_indices), outputs_length):
column_metadata = OrderedDict()
semantic_types = set()
semantic_types.add(hyperparams["return_semantic_type"])
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = list(semantic_types)
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKBinarizer.__doc__ = Binarizer.__doc__
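
# A minimal usage sketch, not part of the original module: it assumes the standard
# d3m primitive API (Hyperparams.defaults(), Hyperparams.replace(), CallResult.value)
# and a purely numeric input DataFrame; the helper name is hypothetical and is used
# only for illustration.
def _example_binarizer_usage() -> d3m_dataframe:
    inputs = d3m_dataframe({'a': [0.0, 1.5, -2.0], 'b': [3.0, 0.0, 0.5]}, generate_metadata=True)
    # Threshold 1.0: values <= 1.0 are mapped to 0, values above it to 1.
    hp = Hyperparams.defaults().replace({'threshold': 1.0})
    primitive = SKBinarizer(hyperparams=hp)
    primitive.set_training_data(inputs=inputs)
    primitive.fit()
    return primitive.produce(inputs=inputs).value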

+ 0  - 490   common-primitives/sklearn-wrap/sklearn_wrap/SKCountVectorizer.py

@@ -1,490 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.feature_extraction.text import CountVectorizer


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase
from d3m.metadata.base import ALL_ELEMENTS


Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
vocabulary_: Optional[Sequence[dict]]
stop_words_: Optional[Sequence[set]]
fixed_vocabulary_: Optional[Sequence[bool]]
_stop_words_id: Optional[Sequence[int]]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]


class Hyperparams(hyperparams.Hyperparams):
strip_accents = hyperparams.Union(
configuration=OrderedDict({
'accents': hyperparams.Enumeration[str](
default='ascii',
values=['ascii', 'unicode'],
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
        description='Remove accents during the preprocessing step. \'ascii\' is a fast method that only works on characters that have a direct ASCII mapping. \'unicode\' is a slightly slower method that works on any characters. None (default) does nothing.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
analyzer = hyperparams.Enumeration[str](
default='word',
values=['word', 'char', 'char_wb'],
description='Whether the feature should be made of word or character n-grams. Option \'char_wb\' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
ngram_range = hyperparams.SortedList(
elements=hyperparams.Bounded[int](1, None, 1),
default=(1, 1),
min_size=2,
max_size=2,
description='The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
stop_words = hyperparams.Union(
configuration=OrderedDict({
'string': hyperparams.Hyperparameter[str](
default='english',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'list': hyperparams.List(
elements=hyperparams.Hyperparameter[str](''),
default=[],
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='If \'english\', a built-in stop word list for English is used. If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == \'word\'``. If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
lowercase = hyperparams.UniformBool(
default=True,
description='Convert all characters to lowercase before tokenizing.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
token_pattern = hyperparams.Hyperparameter[str](
        default='(?u)\\b\\w\\w+\\b',
description='Regular expression denoting what constitutes a "token", only used if ``analyzer == \'word\'``. The default regexp select tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_df = hyperparams.Union(
configuration=OrderedDict({
'proportion': hyperparams.Bounded[float](
default=1.0,
lower=0.0,
upper=1.0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'absolute': hyperparams.Bounded[int](
default=1,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='proportion',
description='When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_df = hyperparams.Union(
configuration=OrderedDict({
'proportion': hyperparams.Bounded[float](
default=1.0,
lower=0.0,
upper=1.0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'absolute': hyperparams.Bounded[int](
default=1,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
default=1,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
        description='If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
binary = hyperparams.UniformBool(
default=False,
description='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)

class SKCountVectorizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn CountVectorizer
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.CountVectorizer.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.MINIMUM_REDUNDANCY_FEATURE_SELECTION, ],
"name": "sklearn.feature_extraction.text.CountVectorizer",
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
"python_path": "d3m.primitives.data_preprocessing.count_vectorizer.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.CountVectorizer.html']},
"version": "2019.11.13",
"id": "0609859b-8ed9-397f-ac7a-7c4f63863560",
"hyperparams_to_tune": ['max_df', 'min_df'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# True
self._clf = list()
self._training_inputs = None
self._target_names = None
self._training_indices = None
self._fitted = False
def set_training_data(self, *, inputs: Inputs) -> None:
self._inputs = inputs
self._fitted = False
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._fitted:
return CallResult(None)

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)

if self._training_inputs is None:
raise ValueError("Missing training data.")

if len(self._training_indices) > 0:
for column_index in range(len(self._training_inputs.columns)):
clf = self._create_new_sklearn_estimator()
clf.fit(self._training_inputs.iloc[:, column_index])
self._clf.append(clf)

self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")

return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs, training_indices = self._get_columns_to_fit(inputs, self.hyperparams)
else:
            training_indices = list(range(len(inputs.columns)))

# Iterating over all estimators and call transform on them.
# No. of estimators should be equal to the number of columns in the input
if len(self._clf) != len(sk_inputs.columns):
raise RuntimeError("Input data does not have the same number of columns as training data")
outputs = []
if len(self._training_indices) > 0:
for column_index in range(len(sk_inputs.columns)):
clf = self._clf[column_index]
output = clf.transform(sk_inputs.iloc[:, column_index])
column_name = sk_inputs.columns[column_index]

if sparse.issparse(output):
output = output.toarray()
output = self._wrap_predictions(inputs, output)

# Updating column names.
                output.columns = ["{}_{}".format(column_name, x) for x in clf.get_feature_names()]
for i, name in enumerate(clf.get_feature_names()):
output.metadata = output.metadata.update((ALL_ELEMENTS, i), {'name': name})

outputs.append(output)
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")

outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=outputs)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
vocabulary_=None,
stop_words_=None,
fixed_vocabulary_=None,
_stop_words_id=None,
training_indices_=self._training_indices,
target_names_=self._target_names
)

return Params(
vocabulary_=list(map(lambda clf: getattr(clf, 'vocabulary_', None), self._clf)),
stop_words_=list(map(lambda clf: getattr(clf, 'stop_words_', None), self._clf)),
fixed_vocabulary_=list(map(lambda clf: getattr(clf, 'fixed_vocabulary_', None), self._clf)),
_stop_words_id=list(map(lambda clf: getattr(clf, '_stop_words_id', None), self._clf)),
training_indices_=self._training_indices,
target_names_=self._target_names
)

def set_params(self, *, params: Params) -> None:
for param, val in params.items():
if val is not None and param not in ['target_names_', 'training_indices_']:
self._clf = list(map(lambda x: self._create_new_sklearn_estimator(), val))
break
for index in range(len(self._clf)):
for param, val in params.items():
if val is not None:
setattr(self._clf[index], param, val[index])
else:
setattr(self._clf[index], param, None)
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._fitted = False
if params['vocabulary_'] is not None:
self._fitted = True
if params['stop_words_'] is not None:
self._fitted = True
if params['fixed_vocabulary_'] is not None:
self._fitted = True
if params['_stop_words_id'] is not None:
self._fitted = True

def _create_new_sklearn_estimator(self):
clf = CountVectorizer(
strip_accents=self.hyperparams['strip_accents'],
analyzer=self.hyperparams['analyzer'],
ngram_range=self.hyperparams['ngram_range'],
stop_words=self.hyperparams['stop_words'],
lowercase=self.hyperparams['lowercase'],
token_pattern=self.hyperparams['token_pattern'],
max_df=self.hyperparams['max_df'],
min_df=self.hyperparams['min_df'],
max_features=self.hyperparams['max_features'],
binary=self.hyperparams['binary'],
)
return clf




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_columns'],
exclude_columns=hyperparams['exclude_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (str,)
accepted_semantic_types = set(["http://schema.org/Text",])
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), []
target_names = []
target_semantic_type = []
target_column_indices = []
metadata = data.metadata
target_column_indices.extend(metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget'))

for column_index in target_column_indices:
if column_index is metadata_base.ALL_ELEMENTS:
continue
column_index = typing.cast(metadata_base.SimpleSelectorSegment, column_index)
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
target_names.append(column_metadata.get('name', str(column_index)))
target_semantic_type.append(column_metadata.get('semantic_types', []))

targets = data.iloc[:, target_column_indices]
return targets, target_names, target_semantic_type

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKCountVectorizer.__doc__ = CountVectorizer.__doc__
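
# A minimal sketch, not part of the original module, of the per-column expansion done
# in produce() above: one fitted CountVectorizer per text column, with output columns
# renamed as "{column_name}_{token}". The corpus, column name and helper name are made
# up for illustration; get_feature_names() matches the sklearn version this wrapper
# was generated against.
def _example_count_vectorizer_expansion():
    corpus = ["the cat sat", "the cat ran"]
    clf = CountVectorizer()
    counts = clf.fit_transform(corpus).toarray()  # one row per document, one column per token
    names = ["{}_{}".format("text", token) for token in clf.get_feature_names()]
    return names, counts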

+ 0  - 621   common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeClassifier.py

@@ -1,621 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.tree.tree import DecisionTreeClassifier
import numpy


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
classes_: Optional[Union[ndarray, List[ndarray]]]
max_features_: Optional[int]
n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]]
n_features_: Optional[int]
n_outputs_: Optional[int]
tree_: Optional[object]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
criterion = hyperparams.Enumeration[str](
values=['gini', 'entropy'],
default='gini',
description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
splitter = hyperparams.Enumeration[str](
values=['best', 'random'],
default='best',
description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_depth = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
default=10,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_split = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
default=2,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_leaf = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=0.5,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_weight_fraction_leaf = hyperparams.Bounded[float](
default=0,
lower=0,
upper=0.5,
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_leaf_nodes = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'specified_int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_impurity_decrease = hyperparams.Bounded[float](
default=0.0,
lower=0.0,
upper=None,
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
class_weight = hyperparams.Union(
configuration=OrderedDict({
'str': hyperparams.Constant(
default='balanced',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
presort = hyperparams.UniformBool(
default=False,
description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKDecisionTreeClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn DecisionTreeClassifier
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ],
"name": "sklearn.tree.tree.DecisionTreeClassifier",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.decision_tree.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html']},
"version": "2019.11.13",
"id": "e20d003d-6a9f-35b0-b4b5-20e42b30282a",
"hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = DecisionTreeClassifier(
criterion=self.hyperparams['criterion'],
splitter=self.hyperparams['splitter'],
max_depth=self.hyperparams['max_depth'],
min_samples_split=self.hyperparams['min_samples_split'],
min_samples_leaf=self.hyperparams['min_samples_leaf'],
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'],
max_leaf_nodes=self.hyperparams['max_leaf_nodes'],
max_features=self.hyperparams['max_features'],
min_impurity_decrease=self.hyperparams['min_impurity_decrease'],
class_weight=self.hyperparams['class_weight'],
presort=self.hyperparams['presort'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warn("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
classes_=None,
max_features_=None,
n_classes_=None,
n_features_=None,
n_outputs_=None,
tree_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
classes_=getattr(self._clf, 'classes_', None),
max_features_=getattr(self._clf, 'max_features_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
n_features_=getattr(self._clf, 'n_features_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
tree_=getattr(self._clf, 'tree_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.classes_ = params['classes_']
self._clf.max_features_ = params['max_features_']
self._clf.n_classes_ = params['n_classes_']
self._clf.n_features_ = params['n_features_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.tree_ = params['tree_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['classes_'] is not None:
self._fitted = True
if params['max_features_'] is not None:
self._fitted = True
if params['n_classes_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['tree_'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKDecisionTreeClassifier.__doc__ = DecisionTreeClassifier.__doc__
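
# A minimal sketch, not part of the original module, of the class-index lookup used in
# log_likelihoods() above: string labels are mapped to their positions in classes_ and,
# for every sample, the log-probability of its true class is picked out. Labels and
# probabilities are made up; the helper name is hypothetical.
def _example_log_likelihood_lookup() -> ndarray:
    classes = numpy.array(['cat', 'dog'])
    log_proba = numpy.log(numpy.array([[0.9, 0.1], [0.2, 0.8]]))  # one row per sample
    outputs_column = pandas.Series(['cat', 'dog'])                # ground-truth labels
    classes_map = pandas.Series(numpy.arange(len(classes)), index=classes)
    mapped_outputs_column = outputs_column.map(classes_map)       # label -> class index
    return log_proba[numpy.arange(len(outputs_column)), mapped_outputs_column]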

+ 0  - 565   common-primitives/sklearn-wrap/sklearn_wrap/SKDecisionTreeRegressor.py

@@ -1,565 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.tree.tree import DecisionTreeRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
max_features_: Optional[int]
n_features_: Optional[int]
n_outputs_: Optional[int]
tree_: Optional[object]
classes_: Optional[Union[ndarray, List[ndarray]]]
n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]]
class_weight: Optional[Union[str, dict, List[dict]]]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
criterion = hyperparams.Enumeration[str](
values=['mse', 'friedman_mse', 'mae'],
default='mse',
description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
splitter = hyperparams.Enumeration[str](
values=['best', 'random'],
default='best',
description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_depth = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=5,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_split = hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Bounded[float](
lower=0,
upper=1,
default=1.0,
description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=2,
description='Minimum number.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='int',
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split. (Float values for percentages were added in sklearn 0.18.)',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_leaf = hyperparams.Union(
configuration=OrderedDict({
'percent': hyperparams.Bounded[float](
lower=0,
upper=0.5,
default=0.25,
description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'absolute': hyperparams.Bounded[int](
lower=1,
upper=None,
default=1,
description='Minimum number.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node. (Float values for percentages were added in sklearn 0.18.)',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_weight_fraction_leaf = hyperparams.Bounded[float](
default=0,
lower=0,
upper=0.5,
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_leaf_nodes = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=10,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'specified_int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='calculated',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires effectively inspecting more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_impurity_decrease = hyperparams.Bounded[float](
default=0.0,
lower=0.0,
upper=None,
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. (Added in sklearn 0.19.)',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
presort = hyperparams.UniformBool(
default=False,
description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
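# Editor's sketch (not part of the deleted file): how the Hyperparams block above is
# typically consumed, assuming the d3m core hyperparams API (Hyperparams.defaults() and
# .replace()); the overridden values are illustrative.
_example_hp = Hyperparams.defaults()                        # criterion='mse', max_depth=None, ...
_example_hp = _example_hp.replace({'max_depth': 5,          # picks the 'int' branch of the Union
                                   'min_samples_leaf': 2})  # picks the 'absolute' branch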

class SKDecisionTreeRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn DecisionTreeRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ],
"name": "sklearn.tree.tree.DecisionTreeRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.decision_tree.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html']},
"version": "2019.11.13",
"id": "6c420bd8-01d1-321f-9a35-afc4b758a5c6",
"hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = DecisionTreeRegressor(
criterion=self.hyperparams['criterion'],
splitter=self.hyperparams['splitter'],
max_depth=self.hyperparams['max_depth'],
min_samples_split=self.hyperparams['min_samples_split'],
min_samples_leaf=self.hyperparams['min_samples_leaf'],
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'],
max_leaf_nodes=self.hyperparams['max_leaf_nodes'],
max_features=self.hyperparams['max_features'],
min_impurity_decrease=self.hyperparams['min_impurity_decrease'],
presort=self.hyperparams['presort'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
max_features_=None,
n_features_=None,
n_outputs_=None,
tree_=None,
classes_=None,
n_classes_=None,
class_weight=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
max_features_=getattr(self._clf, 'max_features_', None),
n_features_=getattr(self._clf, 'n_features_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
tree_=getattr(self._clf, 'tree_', None),
classes_=getattr(self._clf, 'classes_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
class_weight=getattr(self._clf, 'class_weight', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.max_features_ = params['max_features_']
self._clf.n_features_ = params['n_features_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.tree_ = params['tree_']
self._clf.classes_ = params['classes_']
self._clf.n_classes_ = params['n_classes_']
self._clf.class_weight = params['class_weight']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['max_features_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['tree_'] is not None:
self._fitted = True
if params['classes_'] is not None:
self._fitted = True
if params['n_classes_'] is not None:
self._fitted = True
if params['class_weight'] is not None:
self._fitted = True




def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKDecisionTreeRegressor.__doc__ = DecisionTreeRegressor.__doc__
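For orientation, here is a hedged sketch of the fit/produce lifecycle this wrapper exposes, using toy data. It assumes the d3m conventions visible above: `Hyperparams.defaults()` for configuration, keyword-only calls, and `CallResult.value` holding the produced DataFrame. Column names and values are illustrative; with `use_semantic_types` left at its default of False, all columns of `inputs` are treated as features and all columns of `outputs` as targets.

from d3m.container import DataFrame as d3m_dataframe
# from sklearn_wrap.SKDecisionTreeRegressor import SKDecisionTreeRegressor, Hyperparams

inputs = d3m_dataframe({'x1': [1.0, 2.0, 3.0, 4.0], 'x2': [0.5, 0.1, 0.9, 0.3]}, generate_metadata=True)
outputs = d3m_dataframe({'y': [1.5, 2.5, 3.5, 4.5]}, generate_metadata=True)

primitive = SKDecisionTreeRegressor(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=inputs, outputs=outputs)
primitive.fit()
predictions = primitive.produce(inputs=inputs).value           # d3m DataFrame with a 'y' column
importances = primitive.produce_feature_importances().value    # one row, one column per feature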

+ 0
- 503
common-primitives/sklearn-wrap/sklearn_wrap/SKDummyClassifier.py

@@ -1,503 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.dummy import DummyClassifier


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
classes_: Optional[ndarray]
n_classes_: Optional[Union[int,ndarray]]
class_prior_: Optional[ndarray]
n_outputs_: Optional[int]
sparse_output_: Optional[bool]
output_2d_: Optional[bool]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
strategy = hyperparams.Choice(
choices={
'stratified': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'most_frequent': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'prior': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'uniform': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'constant': hyperparams.Hyperparams.define(
configuration=OrderedDict({
'constant': hyperparams.Union(
configuration=OrderedDict({
'str': hyperparams.Hyperparameter[str](
default='one',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'int': hyperparams.Bounded[int](
default=1,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'ndarray': hyperparams.Hyperparameter[ndarray](
default=numpy.array([]),
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='int',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
})
)
},
default='stratified',
description='Strategy to use to generate predictions. * "stratified": generates predictions by respecting the training set\'s class distribution. * "most_frequent": always predicts the most frequent label in the training set. * "prior": always predicts the class that maximizes the class prior (like "most_frequent") and ``predict_proba`` returns the class prior. * "uniform": generates predictions uniformly at random. * "constant": always predicts a constant label that is provided by the user; this is useful for metrics that evaluate a non-majority class. (The "prior" strategy was added in sklearn 0.17.)',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
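# Editor's sketch (not part of the deleted file): the 'strategy' hyperparameter above is a
# nested Choice, and the __init__ below reads it as a mapping. Assuming the d3m core
# Hyperparams API, the default value looks like this:
_example_hp = Hyperparams.defaults()
_example_choice = _example_hp['strategy']['choice']                  # 'stratified' (the default branch)
_example_constant = _example_hp['strategy'].get('constant', 'int')   # only the 'constant' branch defines it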

class SKDummyClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn DummyClassifier
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ],
"name": "sklearn.dummy.DummyClassifier",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.dummy.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html']},
"version": "2019.11.13",
"id": "a1056ddf-2e89-3d8d-8308-2146170ae54d",
"hyperparams_to_tune": ['strategy'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = DummyClassifier(
strategy=self.hyperparams['strategy']['choice'],
constant=self.hyperparams['strategy'].get('constant', 'int'),
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
classes_=None,
n_classes_=None,
class_prior_=None,
n_outputs_=None,
sparse_output_=None,
output_2d_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
classes_=getattr(self._clf, 'classes_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
class_prior_=getattr(self._clf, 'class_prior_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
sparse_output_=getattr(self._clf, 'sparse_output_', None),
output_2d_=getattr(self._clf, 'output_2d_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.classes_ = params['classes_']
self._clf.n_classes_ = params['n_classes_']
self._clf.class_prior_ = params['class_prior_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.sparse_output_ = params['sparse_output_']
self._clf.output_2d_ = params['output_2d_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['classes_'] is not None:
self._fitted = True
if params['n_classes_'] is not None:
self._fitted = True
if params['class_prior_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['sparse_output_'] is not None:
self._fitted = True
if params['output_2d_'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices]  # Select the columns used during training
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKDummyClassifier.__doc__ = DummyClassifier.__doc__
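A hedged usage sketch for this baseline classifier, again with illustrative toy data and assuming the d3m conventions used above (keyword-only calls, `CallResult.value`). It exercises `log_likelihoods`, which is what the ProbabilisticCompositionalityMixin adds on top of the usual fit/produce cycle.

from d3m.container import DataFrame as d3m_dataframe
# from sklearn_wrap.SKDummyClassifier import SKDummyClassifier, Hyperparams

inputs = d3m_dataframe({'x': [1.0, 2.0, 3.0, 4.0]}, generate_metadata=True)
outputs = d3m_dataframe({'label': ['a', 'b', 'a', 'b']}, generate_metadata=True)

primitive = SKDummyClassifier(hyperparams=Hyperparams.defaults())    # strategy defaults to 'stratified'
primitive.set_training_data(inputs=inputs, outputs=outputs)
primitive.fit()
predictions = primitive.produce(inputs=inputs).value
log_liks = primitive.log_likelihoods(inputs=inputs, outputs=outputs).value  # per-sample log P(label)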

+ 0
- 442
common-primitives/sklearn-wrap/sklearn_wrap/SKDummyRegressor.py

@@ -1,442 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.dummy import DummyRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
constant_: Optional[Union[float, ndarray]]
n_outputs_: Optional[int]
output_2d_: Optional[bool]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
strategy = hyperparams.Choice(
choices={
'mean': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'median': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'quantile': hyperparams.Hyperparams.define(
configuration=OrderedDict({
'quantile': hyperparams.Uniform(
default=0.5,
lower=0,
upper=1.0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
})
),
'constant': hyperparams.Hyperparams.define(
configuration=OrderedDict({
'constant': hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Bounded[float](
lower=0,
upper=None,
default=1.0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'int': hyperparams.Bounded[int](
default=1,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'ndarray': hyperparams.Hyperparameter[ndarray](
default=numpy.array([]),
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='float',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
})
)
},
default='mean',
description='Strategy to use to generate predictions. * "mean": always predicts the mean of the training set * "median": always predicts the median of the training set * "quantile": always predicts a specified quantile of the training set, provided with the quantile parameter. * "constant": always predicts a constant value that is provided by the user.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKDummyRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn DummyRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ],
"name": "sklearn.dummy.DummyRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.dummy.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html']},
"version": "2019.11.13",
"id": "05aa5b6a-3b27-34dc-9ba7-8511fb13f253",
"hyperparams_to_tune": ['strategy'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = DummyRegressor(
strategy=self.hyperparams['strategy']['choice'],
quantile=self.hyperparams['strategy'].get('quantile', 0.5),
constant=self.hyperparams['strategy'].get('constant', 'float'),
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
constant_=None,
n_outputs_=None,
output_2d_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
constant_=getattr(self._clf, 'constant_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
output_2d_=getattr(self._clf, 'output_2d_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.constant_ = params['constant_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.output_2d_ = params['output_2d_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['constant_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['output_2d_'] is not None:
self._fitted = True




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata

@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKDummyRegressor.__doc__ = DummyRegressor.__doc__
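
The Params / get_params / set_params plumbing above lets a fitted primitive be checkpointed and restored without retraining. A minimal sketch of that round-trip, assuming the usual d3m conventions not shown in this excerpt (the file's Hyperparams class with its defaults() constructor, and plain d3m DataFrames as training data with use_semantic_types left at False so every column is used):

import pandas
from d3m.container import DataFrame as d3m_dataframe

# Toy frames; with use_semantic_types=False all columns are used as-is.
train_X = d3m_dataframe(pandas.DataFrame({'a': [1.0, 2.0, 3.0]}), generate_metadata=True)
train_y = d3m_dataframe(pandas.DataFrame({'t': [2.0, 4.0, 6.0]}), generate_metadata=True)

primitive = SKDummyRegressor(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=train_X, outputs=train_y)
primitive.fit()

# Transfer the fitted state into a fresh instance without re-fitting;
# set_params() also marks the new instance as fitted, so produce() works.
fresh = SKDummyRegressor(hyperparams=Hyperparams.defaults())
fresh.set_params(params=primitive.get_params())
predictions = fresh.produce(inputs=train_X).value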

+ 0
- 466
common-primitives/sklearn-wrap/sklearn_wrap/SKElasticNet.py

@@ -1,466 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.linear_model.coordinate_descent import ElasticNet


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
coef_: Optional[ndarray]
intercept_: Optional[float]
n_iter_: Optional[int]
dual_gap_: Optional[float]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
alpha = hyperparams.Bounded[float](
default=1.0,
lower=0,
upper=None,
description='Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter.``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
l1_ratio = hyperparams.Uniform(
default=0.5,
lower=0,
upper=1,
description='The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
fit_intercept = hyperparams.UniformBool(
default=True,
description='Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
normalize = hyperparams.UniformBool(
default=False,
description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
precompute = hyperparams.UniformBool(
default=False,
description='Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``True`` to preserve sparsity.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter']
)
max_iter = hyperparams.Bounded[int](
default=1000,
lower=0,
upper=None,
description='The maximum number of iterations',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
tol = hyperparams.Bounded[float](
default=0.0001,
lower=0,
upper=None,
description='The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
positive = hyperparams.UniformBool(
default=False,
description='When set to ``True``, forces the coefficients to be positive.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
selection = hyperparams.Enumeration[str](
default='cyclic',
values=['cyclic', 'random'],
description='If set to \'random\', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to \'random\') often leads to significantly faster convergence especially when tol is higher than 1e-4.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
warm_start = hyperparams.UniformBool(
default=False,
description='When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary <warm_start>`.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKElasticNet(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn ElasticNet
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ELASTIC_NET_REGULARIZATION, ],
"name": "sklearn.linear_model.coordinate_descent.ElasticNet",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.elastic_net.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html']},
"version": "2019.11.13",
"id": "a85d4ffb-49ab-35b1-a70c-6df209312aae",
"hyperparams_to_tune": ['alpha', 'max_iter', 'l1_ratio'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = ElasticNet(
alpha=self.hyperparams['alpha'],
l1_ratio=self.hyperparams['l1_ratio'],
fit_intercept=self.hyperparams['fit_intercept'],
normalize=self.hyperparams['normalize'],
precompute=self.hyperparams['precompute'],
max_iter=self.hyperparams['max_iter'],
tol=self.hyperparams['tol'],
positive=self.hyperparams['positive'],
selection=self.hyperparams['selection'],
warm_start=self.hyperparams['warm_start'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False

def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True

def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
coef_=None,
intercept_=None,
n_iter_=None,
dual_gap_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
coef_=getattr(self._clf, 'coef_', None),
intercept_=getattr(self._clf, 'intercept_', None),
n_iter_=getattr(self._clf, 'n_iter_', None),
dual_gap_=getattr(self._clf, 'dual_gap_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.coef_ = params['coef_']
self._clf.intercept_ = params['intercept_']
self._clf.n_iter_ = params['n_iter_']
self._clf.dual_gap_ = params['dual_gap_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['coef_'] is not None:
self._fitted = True
if params['intercept_'] is not None:
self._fitted = True
if params['n_iter_'] is not None:
self._fitted = True
if params['dual_gap_'] is not None:
self._fitted = True
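# Note: restoring any one of the fitted attributes above is treated as
# evidence that the wrapped estimator was trained, so produce() will not
# raise PrimitiveNotFittedError after set_params() even if fit() was never
# called on this instance.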




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata

@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKElasticNet.__doc__ = ElasticNet.__doc__
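
With use_semantic_types=True, _get_columns_to_fit and _get_targets above select only columns carrying the Attribute and TrueTarget semantic types, respectively. A hedged sketch of tagging a frame so those filters pick up the right columns; it assumes d3m's DataMetadata.add_semantic_type and the Hyperparams defaults()/replace() helpers, which are standard d3m APIs not defined in this file:

import pandas
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import base as metadata_base

data = d3m_dataframe(pandas.DataFrame({'x': [0.0, 1.0, 2.0, 3.0], 'y': [0.0, 2.0, 4.0, 6.0]}),
                     generate_metadata=True)
# Column 0 is the feature, column 1 the regression target.
data.metadata = data.metadata.add_semantic_type(
    (metadata_base.ALL_ELEMENTS, 0), 'https://metadata.datadrivendiscovery.org/types/Attribute')
data.metadata = data.metadata.add_semantic_type(
    (metadata_base.ALL_ELEMENTS, 1), 'https://metadata.datadrivendiscovery.org/types/TrueTarget')

hp = Hyperparams.defaults().replace({'use_semantic_types': True})
primitive = SKElasticNet(hyperparams=hp)
primitive.set_training_data(inputs=data, outputs=data)
primitive.fit()
# return_result defaults to 'new', so only the PredictedTarget column comes back.
predictions = primitive.produce(inputs=data).value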

+ 0
- 675
common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesClassifier.py

@@ -1,675 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.forest import ExtraTreesClassifier


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
estimators_: Optional[Sequence[sklearn.base.BaseEstimator]]
classes_: Optional[Union[ndarray, List[ndarray]]]
n_classes_: Optional[Union[int, List[int]]]
n_features_: Optional[int]
n_outputs_: Optional[int]
oob_score_: Optional[float]
oob_decision_function_: Optional[ndarray]
base_estimator_: Optional[object]
estimator_params: Optional[tuple]
base_estimator: Optional[object]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_estimators = hyperparams.Bounded[int](
default=10,
lower=1,
upper=None,
description='The number of trees in the forest.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
criterion = hyperparams.Enumeration[str](
values=['gini', 'entropy'],
default='gini',
description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_depth = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=10,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_split = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
default=2,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_leaf = hyperparams.Union(
configuration=OrderedDict({
'absolute': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=0.5,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_weight_fraction_leaf = hyperparams.Bounded[float](
default=0,
lower=0,
upper=0.5,
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_leaf_nodes = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
default=10,
lower=0,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'specified_int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='calculated',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_impurity_decrease = hyperparams.Bounded[float](
default=0.0,
lower=0.0,
upper=None,
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
bootstrap = hyperparams.Enumeration[str](
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'],
default='bootstrap',
description='Whether bootstrap samples are used when building trees.'
' And whether to use out-of-bag samples to estimate the generalization accuracy.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
n_jobs = hyperparams.Union(
configuration=OrderedDict({
'limit': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'all_cores': hyperparams.Constant(
default=-1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='limit',
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter']
)
warm_start = hyperparams.UniformBool(
default=False,
description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
class_weight = hyperparams.Union(
configuration=OrderedDict({
'str': hyperparams.Enumeration[str](
default='balanced',
values=['balanced', 'balanced_subsample'],
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKExtraTreesClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn ExtraTreesClassifier
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ],
"name": "sklearn.ensemble.forest.ExtraTreesClassifier",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.extra_trees.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html']},
"version": "2019.11.13",
"id": "c8a28f02-ef4a-35a8-87f1-cf79980f5c3e",
"hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_verbose: int = 0) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# False
self._clf = ExtraTreesClassifier(
n_estimators=self.hyperparams['n_estimators'],
criterion=self.hyperparams['criterion'],
max_depth=self.hyperparams['max_depth'],
min_samples_split=self.hyperparams['min_samples_split'],
min_samples_leaf=self.hyperparams['min_samples_leaf'],
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'],
max_leaf_nodes=self.hyperparams['max_leaf_nodes'],
max_features=self.hyperparams['max_features'],
min_impurity_decrease=self.hyperparams['min_impurity_decrease'],
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'],
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'],
n_jobs=self.hyperparams['n_jobs'],
warm_start=self.hyperparams['warm_start'],
class_weight=self.hyperparams['class_weight'],
random_state=self.random_seed,
verbose=_verbose
)
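# Note: the single 'bootstrap' enumeration hyperparam above is unpacked into
# two sklearn arguments: bootstrap=True for both 'bootstrap' and
# 'bootstrap_with_oob_score', and oob_score=True only for the latter.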
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False

def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True

def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
estimators_=None,
classes_=None,
n_classes_=None,
n_features_=None,
n_outputs_=None,
oob_score_=None,
oob_decision_function_=None,
base_estimator_=None,
estimator_params=None,
base_estimator=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
estimators_=getattr(self._clf, 'estimators_', None),
classes_=getattr(self._clf, 'classes_', None),
n_classes_=getattr(self._clf, 'n_classes_', None),
n_features_=getattr(self._clf, 'n_features_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
oob_score_=getattr(self._clf, 'oob_score_', None),
oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None),
base_estimator_=getattr(self._clf, 'base_estimator_', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
base_estimator=getattr(self._clf, 'base_estimator', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.estimators_ = params['estimators_']
self._clf.classes_ = params['classes_']
self._clf.n_classes_ = params['n_classes_']
self._clf.n_features_ = params['n_features_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.oob_score_ = params['oob_score_']
self._clf.oob_decision_function_ = params['oob_decision_function_']
self._clf.base_estimator_ = params['base_estimator_']
self._clf.estimator_params = params['estimator_params']
self._clf.base_estimator = params['base_estimator']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['estimators_'] is not None:
self._fitted = True
if params['classes_'] is not None:
self._fitted = True
if params['n_classes_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['oob_score_'] is not None:
self._fitted = True
if params['oob_decision_function_'] is not None:
self._fitted = True
if params['base_estimator_'] is not None:
self._fitted = True
if params['estimator_params'] is not None:
self._fitted = True
if params['base_estimator'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)

@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata

@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKExtraTreesClassifier.__doc__ = ExtraTreesClassifier.__doc__
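
Beyond produce(), this wrapper exposes per-sample log-likelihoods of supplied labels (log_likelihoods, from ProbabilisticCompositionalityMixin) and per-feature importances (produce_feature_importances). A hedged sketch under the default hyperparams (use_semantic_types=False, so every column of the toy frames is used); as above, Hyperparams.defaults() and the d3m DataFrame constructor are assumed from the d3m core API:

import pandas
from d3m.container import DataFrame as d3m_dataframe

X = d3m_dataframe(pandas.DataFrame({'f0': [0.0, 1.0, 0.0, 1.0], 'f1': [1.0, 0.0, 1.0, 0.0]}),
                  generate_metadata=True)
y = d3m_dataframe(pandas.DataFrame({'label': ['a', 'b', 'a', 'b']}), generate_metadata=True)

clf = SKExtraTreesClassifier(hyperparams=Hyperparams.defaults())
clf.set_training_data(inputs=X, outputs=y)
clf.fit()

# log P(true label | row) for each row, one column per target column.
log_lik = clf.log_likelihoods(inputs=X, outputs=y).value
# A single row of importances, one column per training feature.
importances = clf.produce_feature_importances().value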

+ 0
- 607
common-primitives/sklearn-wrap/sklearn_wrap/SKExtraTreesRegressor.py

@@ -1,607 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble.forest import ExtraTreesRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
estimators_: Optional[List[sklearn.tree.ExtraTreeRegressor]]
n_features_: Optional[int]
n_outputs_: Optional[int]
oob_score_: Optional[float]
oob_prediction_: Optional[ndarray]
base_estimator_: Optional[object]
estimator_params: Optional[tuple]
class_weight: Optional[Union[str, dict, List[dict]]]
base_estimator: Optional[object]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_estimators = hyperparams.Bounded[int](
default=10,
lower=1,
upper=None,
description='The number of trees in the forest.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
criterion = hyperparams.Enumeration[str](
values=['mse', 'mae'],
default='mse',
description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_depth = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=5,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_split = hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Bounded[float](
lower=0,
upper=1,
default=1.0,
description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=2,
description='Minimum number.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='int',
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_samples_leaf = hyperparams.Union(
configuration=OrderedDict({
'percent': hyperparams.Bounded[float](
lower=0,
upper=0.5,
default=0.25,
description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'absolute': hyperparams.Bounded[int](
lower=1,
upper=None,
default=1,
description='Minimum number.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='absolute',
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_weight_fraction_leaf = hyperparams.Bounded[float](
default=0,
lower=0,
upper=0.5,
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_leaf_nodes = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=10,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
max_features = hyperparams.Union(
configuration=OrderedDict({
'specified_int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'calculated': hyperparams.Enumeration[str](
values=['auto', 'sqrt', 'log2'],
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'percent': hyperparams.Bounded[float](
default=0.25,
lower=0,
upper=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='calculated',
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
min_impurity_decrease = hyperparams.Bounded[float](
default=0.0,
lower=0.0,
upper=None,
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
bootstrap = hyperparams.Enumeration[str](
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'],
default='bootstrap',
description='Whether bootstrap samples are used when building trees.'
' And whether to use out-of-bag samples to estimate the generalization accuracy.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
warm_start = hyperparams.UniformBool(
default=False,
description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_jobs = hyperparams.Union(
configuration=OrderedDict({
'limit': hyperparams.Bounded[int](
default=1,
lower=1,
upper=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'all_cores': hyperparams.Constant(
default=-1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='limit',
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
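# --- Editor's sketch (not part of the original file) ---------------------------------
# A hedged illustration of how the Union and Enumeration hyperparameters above are set.
# It assumes the d3m Hyperparams API of this generation (classmethod defaults() and
# instance method replace()); the concrete values are examples only.
def _example_hyperparams() -> Hyperparams:
    hp = Hyperparams.defaults()
    # Union hyperparameters such as 'max_depth' accept a value from either branch:
    # an int for the bounded 'int' configuration, or None for the 'none' constant.
    # Enumerations such as 'max_features' accept one of their listed values.
    return hp.replace({'n_estimators': 100, 'max_depth': 10, 'max_features': 'sqrt'})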

class SKExtraTreesRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn ExtraTreesRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ],
"name": "sklearn.ensemble.forest.ExtraTreesRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.extra_trees.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html']},
"version": "2019.11.13",
"id": "35321059-2a1a-31fd-9509-5494efc751c7",
"hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_verbose: int = 0) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        # Build the underlying sklearn ExtraTreesRegressor from the hyperparameter values.
self._clf = ExtraTreesRegressor(
n_estimators=self.hyperparams['n_estimators'],
criterion=self.hyperparams['criterion'],
max_depth=self.hyperparams['max_depth'],
min_samples_split=self.hyperparams['min_samples_split'],
min_samples_leaf=self.hyperparams['min_samples_leaf'],
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'],
max_leaf_nodes=self.hyperparams['max_leaf_nodes'],
max_features=self.hyperparams['max_features'],
min_impurity_decrease=self.hyperparams['min_impurity_decrease'],
            # The single 'bootstrap' enumeration maps onto sklearn's two flags: bootstrapping is
            # enabled for both 'bootstrap' values, OOB scoring only for 'bootstrap_with_oob_score'.
            bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'],
            oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'],
warm_start=self.hyperparams['warm_start'],
n_jobs=self.hyperparams['n_jobs'],
random_state=self.random_seed,
verbose=_verbose
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
estimators_=None,
n_features_=None,
n_outputs_=None,
oob_score_=None,
oob_prediction_=None,
base_estimator_=None,
estimator_params=None,
class_weight=None,
base_estimator=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
estimators_=getattr(self._clf, 'estimators_', None),
n_features_=getattr(self._clf, 'n_features_', None),
n_outputs_=getattr(self._clf, 'n_outputs_', None),
oob_score_=getattr(self._clf, 'oob_score_', None),
oob_prediction_=getattr(self._clf, 'oob_prediction_', None),
base_estimator_=getattr(self._clf, 'base_estimator_', None),
estimator_params=getattr(self._clf, 'estimator_params', None),
class_weight=getattr(self._clf, 'class_weight', None),
base_estimator=getattr(self._clf, 'base_estimator', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.estimators_ = params['estimators_']
self._clf.n_features_ = params['n_features_']
self._clf.n_outputs_ = params['n_outputs_']
self._clf.oob_score_ = params['oob_score_']
self._clf.oob_prediction_ = params['oob_prediction_']
self._clf.base_estimator_ = params['base_estimator_']
self._clf.estimator_params = params['estimator_params']
self._clf.class_weight = params['class_weight']
self._clf.base_estimator = params['base_estimator']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['estimators_'] is not None:
self._fitted = True
if params['n_features_'] is not None:
self._fitted = True
if params['n_outputs_'] is not None:
self._fitted = True
if params['oob_score_'] is not None:
self._fitted = True
if params['oob_prediction_'] is not None:
self._fitted = True
if params['base_estimator_'] is not None:
self._fitted = True
if params['estimator_params'] is not None:
self._fitted = True
if params['class_weight'] is not None:
self._fitted = True
if params['base_estimator'] is not None:
self._fitted = True




def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
output.columns = self._input_column_names
for i in range(len(self._input_column_names)):
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
return CallResult(output)
@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKExtraTreesRegressor.__doc__ = ExtraTreesRegressor.__doc__
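# --- Editor's sketch (not part of the original file) ---------------------------------
# A hedged end-to-end example of the primitive's lifecycle. `attributes` and `targets`
# are assumed to be numeric d3m DataFrames prepared upstream (e.g. by dataset parsing
# primitives); only methods defined above plus the standard Hyperparams.defaults() are used.
def _example_extra_trees(attributes: d3m_dataframe, targets: d3m_dataframe) -> d3m_dataframe:
    primitive = SKExtraTreesRegressor(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=attributes, outputs=targets)
    primitive.fit()
    # produce() wraps its result in a CallResult; .value is the predictions DataFrame.
    predictions = primitive.produce(inputs=attributes).value
    # One row, one column per training input column, named after those columns.
    importances = primitive.produce_feature_importances().value
    print(importances)
    return predictions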

+ 0
- 439
common-primitives/sklearn-wrap/sklearn_wrap/SKFastICA.py

@@ -1,439 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.decomposition.fastica_ import FastICA


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase


Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
n_iter_: Optional[int]
mixing_: Optional[ndarray]
components_: Optional[ndarray]
mean_: Optional[ndarray]
whitening_: Optional[ndarray]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_components = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=0,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
description='All components are used.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Number of components to extract. If None no dimension reduction is performed.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
algorithm = hyperparams.Enumeration[str](
default='parallel',
values=['parallel', 'deflation'],
description='Apply a parallel or deflational FASTICA algorithm.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
whiten = hyperparams.UniformBool(
default=True,
description='If True perform an initial whitening of the data. If False, the data is assumed to have already been preprocessed: it should be centered, normed and white. Otherwise you will get incorrect results. In this case the parameter n_components will be ignored.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
fun = hyperparams.Choice(
choices={
'logcosh': hyperparams.Hyperparams.define(
configuration=OrderedDict({
'alpha': hyperparams.Hyperparameter[float](
default=1,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
})
),
'exp': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
),
'cube': hyperparams.Hyperparams.define(
configuration=OrderedDict({})
)
},
default='logcosh',
description='The functional form of the G function used in the approximation to neg-entropy. Could be either \'logcosh\', \'exp\', or \'cube\'. You can also provide your own function. It should return a tuple containing the value of the function, and of its derivative, in the point. Example: def my_g(x): return x ** 3, 3 * x ** 2',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
    max_iter = hyperparams.Bounded[int](
        default=200,
        lower=0,
        upper=None,
        description='Maximum number of iterations to perform.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    tol = hyperparams.Bounded[float](
        default=0.0001,
        lower=0,
        upper=None,
        description='A positive scalar giving the tolerance at which the un-mixing matrix is considered to have converged.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
w_init = hyperparams.Union(
configuration=OrderedDict({
'ndarray': hyperparams.Hyperparameter[ndarray](
default=numpy.array([]),
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'none': hyperparams.Constant(
default=None,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='none',
description='Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.\'s is used.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
default='https://metadata.datadrivendiscovery.org/types/Attribute',
description='Decides what semantic type to attach to generated attributes',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKFastICA(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn FastICA
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, ],
"name": "sklearn.decomposition.fastica_.FastICA",
"primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
"python_path": "d3m.primitives.data_transformation.fast_ica.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html']},
"version": "2019.11.13",
"id": "03633ffa-425e-37d4-9f1c-bbb552f1e995",
"hyperparams_to_tune": ['n_components', 'algorithm'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        # Build the underlying sklearn FastICA transformer from the hyperparameter values.
self._clf = FastICA(
n_components=self.hyperparams['n_components'],
algorithm=self.hyperparams['algorithm'],
whiten=self.hyperparams['whiten'],
            # The Choice hyperparameter stores the selected function name under 'choice';
            # its full nested value (e.g. 'alpha' for 'logcosh') is forwarded to sklearn as fun_args.
            fun=self.hyperparams['fun']['choice'],
            fun_args=self.hyperparams['fun'],
max_iter=self.hyperparams['max_iter'],
tol=self.hyperparams['tol'],
w_init=self.hyperparams['w_init'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
def set_training_data(self, *, inputs: Inputs) -> None:
self._inputs = inputs
self._fitted = False
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._fitted:
return CallResult(None)

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if self._training_inputs is None:
return CallResult(None)

if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs = inputs.iloc[:, self._training_indices]
output_columns = []
if len(self._training_indices) > 0:
sk_output = self._clf.transform(sk_inputs)
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
outputs = self._wrap_predictions(inputs, sk_output)
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)
return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
n_iter_=None,
mixing_=None,
components_=None,
mean_=None,
whitening_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
n_iter_=getattr(self._clf, 'n_iter_', None),
mixing_=getattr(self._clf, 'mixing_', None),
components_=getattr(self._clf, 'components_', None),
mean_=getattr(self._clf, 'mean_', None),
whitening_=getattr(self._clf, 'whitening_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.n_iter_ = params['n_iter_']
self._clf.mixing_ = params['mixing_']
self._clf.components_ = params['components_']
self._clf.mean_ = params['mean_']
self._clf.whitening_ = params['whitening_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['n_iter_'] is not None:
self._fitted = True
if params['mixing_'] is not None:
self._fitted = True
if params['components_'] is not None:
self._fitted = True
if params['mean_'] is not None:
self._fitted = True
if params['whitening_'] is not None:
self._fitted = True



@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_columns'],
exclude_columns=hyperparams['exclude_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int],
outputs_metadata: metadata_base.DataMetadata, hyperparams):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in input_indices:
column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)

column_metadata = OrderedDict(inputs_metadata.query_column(column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

# If outputs has more columns than index, add Attribute Type to all remaining
if outputs_length > len(input_indices):
for column_index in range(len(input_indices), outputs_length):
column_metadata = OrderedDict()
semantic_types = set()
semantic_types.add(hyperparams["return_semantic_type"])
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = list(semantic_types)
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKFastICA.__doc__ = FastICA.__doc__
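# --- Editor's sketch (not part of the original file) ---------------------------------
# A hedged example of fitting this transformer on `features`, an assumed, already-numeric
# d3m DataFrame prepared upstream. With Hyperparams.defaults() the Choice hyperparameter
# 'fun' resolves to 'logcosh', and __init__ above forwards its nested value (including
# 'alpha') to sklearn as fun_args.
def _example_fast_ica(features: d3m_dataframe) -> d3m_dataframe:
    hp = Hyperparams.defaults().replace({'n_components': 3})
    primitive = SKFastICA(hyperparams=hp)
    primitive.set_training_data(inputs=features)
    primitive.fit()
    # produce() returns a CallResult; .value holds the DataFrame of extracted components.
    return primitive.produce(inputs=features).value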

+ 0
- 361
common-primitives/sklearn-wrap/sklearn_wrap/SKFeatureAgglomeration.py

@@ -1,361 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.cluster.hierarchical import FeatureAgglomeration
from numpy import mean as npmean


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase


Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
labels_: Optional[ndarray]
n_leaves_: Optional[int]
children_: Optional[ndarray]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_clusters = hyperparams.Bounded[int](
default=2,
lower=0,
upper=None,
description='The number of clusters to find.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
affinity = hyperparams.Enumeration[str](
default='euclidean',
values=['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed'],
description='Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or \'precomputed\'. If linkage is "ward", only "euclidean" is accepted.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
compute_full_tree = hyperparams.Union(
configuration=OrderedDict({
'auto': hyperparams.Constant(
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'bool': hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='auto',
description='Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of features. This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
linkage = hyperparams.Enumeration[str](
default='ward',
values=['ward', 'complete', 'average', 'single'],
description='Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. - ward minimizes the variance of the clusters being merged. - average uses the average of the distances of each feature of the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
default='https://metadata.datadrivendiscovery.org/types/Attribute',
description='Decides what semantic type to attach to generated attributes',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKFeatureAgglomeration(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn FeatureAgglomeration
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_STREAM_CLUSTERING, ],
"name": "sklearn.cluster.hierarchical.FeatureAgglomeration",
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING,
"python_path": "d3m.primitives.data_preprocessing.feature_agglomeration.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html']},
"version": "2019.11.13",
"id": "f259b009-5e0f-37b1-b117-441aba2b65c8",
"hyperparams_to_tune": ['n_clusters', 'affinity', 'linkage'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        # Build the underlying sklearn FeatureAgglomeration transformer from the hyperparameter values.
self._clf = FeatureAgglomeration(
n_clusters=self.hyperparams['n_clusters'],
affinity=self.hyperparams['affinity'],
compute_full_tree=self.hyperparams['compute_full_tree'],
linkage=self.hyperparams['linkage'],
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
def set_training_data(self, *, inputs: Inputs) -> None:
self._inputs = inputs
self._fitted = False
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._fitted:
return CallResult(None)

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if self._training_inputs is None:
return CallResult(None)

if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs = inputs.iloc[:, self._training_indices]
output_columns = []
if len(self._training_indices) > 0:
sk_output = self._clf.transform(sk_inputs)
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
outputs = self._wrap_predictions(inputs, sk_output)
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)
return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
labels_=None,
n_leaves_=None,
children_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
labels_=getattr(self._clf, 'labels_', None),
n_leaves_=getattr(self._clf, 'n_leaves_', None),
children_=getattr(self._clf, 'children_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.labels_ = params['labels_']
self._clf.n_leaves_ = params['n_leaves_']
self._clf.children_ = params['children_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['labels_'] is not None:
self._fitted = True
if params['n_leaves_'] is not None:
self._fitted = True
if params['children_'] is not None:
self._fitted = True



@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_columns'],
exclude_columns=hyperparams['exclude_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set([])
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):

outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_name = "output_{}".format(column_index)
column_metadata = OrderedDict()
semantic_types = set()
semantic_types.add(hyperparams["return_semantic_type"])
column_metadata['semantic_types'] = list(semantic_types)

column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKFeatureAgglomeration.__doc__ = FeatureAgglomeration.__doc__
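# --- Editor's sketch (not part of the original file) ---------------------------------
# A hedged illustration of the get_params()/set_params() pattern these wrappers share:
# the fitted sklearn attributes and column bookkeeping captured from one instance are
# restored into a fresh one, which set_params() then marks as fitted. `features` is an
# assumed, already-prepared numeric d3m DataFrame.
def _example_param_transfer(features: d3m_dataframe) -> d3m_dataframe:
    trained = SKFeatureAgglomeration(hyperparams=Hyperparams.defaults())
    trained.set_training_data(inputs=features)
    trained.fit()
    fresh = SKFeatureAgglomeration(hyperparams=Hyperparams.defaults())
    fresh.set_params(params=trained.get_params())
    return fresh.produce(inputs=features).value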

+ 0
- 492
common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianNB.py

@@ -1,492 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.naive_bayes import GaussianNB


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
class_prior_: Optional[ndarray]
class_count_: Optional[ndarray]
theta_: Optional[ndarray]
sigma_: Optional[ndarray]
classes_: Optional[ndarray]
epsilon_: Optional[float]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
var_smoothing = hyperparams.Bounded[float](
lower=0,
upper=None,
default=1e-09,
description='Portion of the largest variance of all features that is added to variances for calculation stability.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)
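
# Editor's sketch (not part of the original file): overriding the
# column-selection hyperparams defined above. This assumes the usual d3m
# pattern of building a new Hyperparams instance from the defaults; the
# column indices used here are purely illustrative.
#
#   hp = Hyperparams(Hyperparams.defaults(),
#                    use_semantic_types=True,
#                    use_inputs_columns=(1, 2),
#                    return_result='replace')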

class SKGaussianNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams],
ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn GaussianNB
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ],
"name": "sklearn.naive_bayes.GaussianNB",
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
"python_path": "d3m.primitives.classification.gaussian_naive_bayes.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html']},
"version": "2019.11.13",
"id": "464783a8-771e-340d-999b-ae90b9f84f0b",
"hyperparams_to_tune": ['var_smoothing'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None,
_priors: Union[ndarray, None] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# Build the underlying scikit-learn estimator from the primitive's hyperparams.
self._clf = GaussianNB(
var_smoothing=self.hyperparams['var_smoothing'],
priors=_priors
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._training_inputs is None or self._training_outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.partial_fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
class_prior_=None,
class_count_=None,
theta_=None,
sigma_=None,
classes_=None,
epsilon_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
class_prior_=getattr(self._clf, 'class_prior_', None),
class_count_=getattr(self._clf, 'class_count_', None),
theta_=getattr(self._clf, 'theta_', None),
sigma_=getattr(self._clf, 'sigma_', None),
classes_=getattr(self._clf, 'classes_', None),
epsilon_=getattr(self._clf, 'epsilon_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.class_prior_ = params['class_prior_']
self._clf.class_count_ = params['class_count_']
self._clf.theta_ = params['theta_']
self._clf.sigma_ = params['sigma_']
self._clf.classes_ = params['classes_']
self._clf.epsilon_ = params['epsilon_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['class_prior_'] is not None:
self._fitted = True
if params['class_count_'] is not None:
self._fitted = True
if params['theta_'] is not None:
self._fitted = True
if params['sigma_'] is not None:
self._fitted = True
if params['classes_'] is not None:
self._fitted = True
if params['epsilon_'] is not None:
self._fitted = True


def log_likelihoods(self, *,
outputs: Outputs,
inputs: Inputs,
timeout: float = None,
iterations: int = None) -> CallResult[Sequence[float]]:
inputs = inputs.iloc[:, self._training_indices] # Get ndarray
outputs = outputs.iloc[:, self._target_column_indices]

if len(inputs.columns) and len(outputs.columns):

if outputs.shape[1] != self._clf.n_outputs_:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

log_proba = self._clf.predict_log_proba(inputs)

# Making it always a list, even when only one target.
if self._clf.n_outputs_ == 1:
log_proba = [log_proba]
classes = [self._clf.classes_]
else:
classes = self._clf.classes_

samples_length = inputs.shape[0]

log_likelihoods = []
for k in range(self._clf.n_outputs_):
# We have to map each class to its internal (numerical) index used in the learner.
# This allows "outputs" to contain string classes.
outputs_column = outputs.iloc[:, k]
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
mapped_outputs_column = outputs_column.map(classes_map)
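# (Editor's note) Worked example of this mapping: if classes[k] is
# array(['no', 'yes']) and outputs_column is ['yes', 'no', 'yes'], then
# classes_map is {'no': 0, 'yes': 1} and mapped_outputs_column becomes
# [1, 0, 1], which below picks the per-row log-probability of the true
# class out of log_proba[k].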

# For each target column (column in "outputs"), for each sample (row) we pick the log
# likelihood for a given class.
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
results.columns = outputs.columns

for k in range(self._clf.n_outputs_):
column_metadata = outputs.metadata.query_column(k)
if 'name' in column_metadata:
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})

else:
results = d3m_dataframe(generate_metadata=True)

return CallResult(results)


@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
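# (Editor's example) e.g., a column typed as both .../Attribute and
# .../CategoricalData passes this check, since every accepted type is
# present; a column typed only as .../PrimaryKey is rejected.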
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKGaussianNB.__doc__ = GaussianNB.__doc__
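
# --- Usage sketch (editor's addition, not part of the original module) ---
# Minimal end-to-end use of the primitive through the d3m supervised-learner
# interface implemented above. "train_X", "train_y", "test_X" and "test_y"
# are hypothetical d3m DataFrames prepared by earlier pipeline steps.
#
#   primitive = SKGaussianNB(hyperparams=Hyperparams.defaults())
#   primitive.set_training_data(inputs=train_X, outputs=train_y)
#   primitive.fit()
#   predictions = primitive.produce(inputs=test_X).value
#   log_liks = primitive.log_likelihoods(inputs=test_X, outputs=test_y).value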

+ 0  - 463   common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianProcessRegressor.py

@@ -1,463 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.gaussian_process.gpr import GaussianProcessRegressor


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer

from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas



Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
X_train_: Optional[ndarray]
y_train_: Optional[ndarray]
kernel_: Optional[Callable]
alpha_: Optional[ndarray]
log_marginal_likelihood_value_: Optional[float]
_y_train_mean: Optional[ndarray]
_rng: Optional[numpy.random.mtrand.RandomState]
L_: Optional[ndarray]
_K_inv: Optional[object]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
alpha = hyperparams.Union(
configuration=OrderedDict({
'float': hyperparams.Hyperparameter[float](
default=1e-10,
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'ndarray': hyperparams.Hyperparameter[ndarray](
default=numpy.array([]),
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='float',
description='Value added to the diagonal of the kernel matrix during fitting. Larger values correspond to increased noise level in the observations and reduce potential numerical issue during fitting. If an array is passed, it must have the same number of entries as the data used for fitting and is used as datapoint-dependent noise level. Note that this is equivalent to adding a WhiteKernel with c=alpha. Allowing to specify the noise level directly as a parameter is mainly for convenience and for consistency with Ridge.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
optimizer = hyperparams.Constant(
default='fmin_l_bfgs_b',
description='Can either be one of the internally supported optimizers for optimizing the kernel\'s parameters, specified by a string, or an externally defined optimizer passed as a callable. If a callable is passed, it must have the signature:: def optimizer(obj_func, initial_theta, bounds): # * \'obj_func\' is the objective function to be maximized, which # takes the hyperparameters theta as parameter and an # optional flag eval_gradient, which determines if the # gradient is returned additionally to the function value # * \'initial_theta\': the initial value for theta, which can be # used by local optimizers # * \'bounds\': the bounds on the values of theta .... # Returned are the best found hyperparameters theta and # the corresponding value of the target function. return theta_opt, func_min Per default, the \'fmin_l_bfgs_b\' algorithm from scipy.optimize is used. If None is passed, the kernel\'s parameters are kept fixed. Available internal optimizers are:: \'fmin_l_bfgs_b\'',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
n_restarts_optimizer = hyperparams.Bounded[int](
default=0,
lower=0,
upper=None,
description='The number of restarts of the optimizer for finding the kernel\'s parameters which maximize the log-marginal likelihood. The first run of the optimizer is performed from the kernel\'s initial parameters, the remaining ones (if any) from thetas sampled log-uniform randomly from the space of allowed theta-values. If greater than 0, all bounds must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
normalize_y = hyperparams.UniformBool(
default=False,
description='Whether the target values y are normalized, i.e., the mean of the observed target values become zero. This parameter should be set to True if the target values\' mean is expected to differ considerable from zero. When enabled, the normalization effectively modifies the GP\'s prior based on the data, which contradicts the likelihood principle; normalization is thus disabled per default. copy_X_train : bool, optional (default: True) If True, a persistent copy of the training data is stored in the object. Otherwise, just a reference to the training data is stored, which might cause predictions to change if the data is modified externally.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
)
use_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
)
exclude_inputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
)
exclude_outputs_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
description='Decides what semantic type to attach to generated output',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKGaussianProcessRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn GaussianProcessRegressor
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.GAUSSIAN_PROCESS, ],
"name": "sklearn.gaussian_process.gpr.GaussianProcessRegressor",
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
"python_path": "d3m.primitives.regression.gaussian_process.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html']},
"version": "2019.11.13",
"id": "3894e630-d67b-35d9-ab78-233e264f6324",
"hyperparams_to_tune": ['alpha'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# Build the underlying scikit-learn estimator from the primitive's hyperparams.
self._clf = GaussianProcessRegressor(
alpha=self.hyperparams['alpha'],
optimizer=self.hyperparams['optimizer'],
n_restarts_optimizer=self.hyperparams['n_restarts_optimizer'],
normalize_y=self.hyperparams['normalize_y'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
self._new_training_data = False
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
self._inputs = inputs
self._outputs = outputs
self._fitted = False
self._new_training_data = True
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._inputs is None or self._outputs is None:
raise ValueError("Missing training data.")

if not self._new_training_data:
return CallResult(None)
self._new_training_data = False

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
sk_training_output = self._training_outputs.values

shape = sk_training_output.shape
if len(shape) == 2 and shape[1] == 1:
sk_training_output = numpy.ravel(sk_training_output)

self._clf.fit(self._training_inputs, sk_training_output)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")

return CallResult(None)

def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
output = []
if len(sk_inputs.columns):
try:
sk_output = self._clf.predict(sk_inputs)
except sklearn.exceptions.NotFittedError as error:
raise PrimitiveNotFittedError("Primitive not fitted.") from error
# For primitives that allow predicting without fitting like GaussianProcessRegressor
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
output = self._wrap_predictions(inputs, sk_output)
output.columns = self._target_names
output = [output]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._target_column_indices,
columns_list=output)

return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
X_train_=None,
y_train_=None,
kernel_=None,
alpha_=None,
log_marginal_likelihood_value_=None,
_y_train_mean=None,
_rng=None,
L_=None,
_K_inv=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
X_train_=getattr(self._clf, 'X_train_', None),
y_train_=getattr(self._clf, 'y_train_', None),
kernel_=getattr(self._clf, 'kernel_', None),
alpha_=getattr(self._clf, 'alpha_', None),
log_marginal_likelihood_value_=getattr(self._clf, 'log_marginal_likelihood_value_', None),
_y_train_mean=getattr(self._clf, '_y_train_mean', None),
_rng=getattr(self._clf, '_rng', None),
L_=getattr(self._clf, 'L_', None),
_K_inv=getattr(self._clf, '_K_inv', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.X_train_ = params['X_train_']
self._clf.y_train_ = params['y_train_']
self._clf.kernel_ = params['kernel_']
self._clf.alpha_ = params['alpha_']
self._clf.log_marginal_likelihood_value_ = params['log_marginal_likelihood_value_']
self._clf._y_train_mean = params['_y_train_mean']
self._clf._rng = params['_rng']
self._clf.L_ = params['L_']
self._clf._K_inv = params['_K_inv']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['X_train_'] is not None:
self._fitted = True
if params['y_train_'] is not None:
self._fitted = True
if params['kernel_'] is not None:
self._fitted = True
if params['alpha_'] is not None:
self._fitted = True
if params['log_marginal_likelihood_value_'] is not None:
self._fitted = True
if params['_y_train_mean'] is not None:
self._fitted = True
if params['_rng'] is not None:
self._fitted = True
if params['L_'] is not None:
self._fitted = True
if params['_K_inv'] is not None:
self._fitted = True




@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_inputs_columns'],
exclude_columns=hyperparams['exclude_inputs_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False
@classmethod
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return data, list(data.columns), list(range(len(data.columns)))

metadata = data.metadata

def can_produce_column(column_index: int) -> bool:
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
semantic_types = set(column_metadata.get('semantic_types', []))
if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True
return False

target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
use_columns=hyperparams[
'use_outputs_columns'],
exclude_columns=
hyperparams[
'exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = []
if target_column_indices:
targets = data.select_columns(target_column_indices)
target_column_names = []
for idx in target_column_indices:
target_column_names.append(data.columns[idx])
return targets, target_column_names, target_column_indices

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",])
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",])
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=False)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict()
semantic_types = []
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
if column_name is None:
column_name = "output_{}".format(column_index)
column_metadata["semantic_types"] = semantic_types
column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKGaussianProcessRegressor.__doc__ = GaussianProcessRegressor.__doc__
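
# --- Usage sketch (editor's addition, not part of the original module) ---
# The regressor follows the same lifecycle as the classifier above; the main
# difference worth showing is overriding the 'alpha' Union hyperparam with a
# plain float, assuming the usual d3m pattern of passing the raw value for a
# Union configuration. "train_X", "train_y" and "test_X" are hypothetical
# DataFrames.
#
#   hp = Hyperparams(Hyperparams.defaults(), alpha=1e-6)
#   gpr = SKGaussianProcessRegressor(hyperparams=hp)
#   gpr.set_training_data(inputs=train_X, outputs=train_y)
#   gpr.fit()
#   predictions = gpr.produce(inputs=test_X).value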

+ 0  - 344   common-primitives/sklearn-wrap/sklearn_wrap/SKGaussianRandomProjection.py

@@ -1,344 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.random_projection import GaussianRandomProjection


from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase


Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
n_components_: Optional[int]
components_: Optional[Union[ndarray, sparse.spmatrix]]
input_column_names: Optional[Any]
target_names_: Optional[Sequence[Any]]
training_indices_: Optional[Sequence[int]]
target_column_indices_: Optional[Sequence[int]]
target_columns_metadata_: Optional[List[OrderedDict]]



class Hyperparams(hyperparams.Hyperparams):
n_components = hyperparams.Union(
configuration=OrderedDict({
'int': hyperparams.Bounded[int](
lower=0,
upper=None,
default=100,
description='Number of components to keep.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
),
'auto': hyperparams.Constant(
default='auto',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
)
}),
default='auto',
description='Dimensionality of the target projection space. n_components can be automatically adjusted according to the number of samples in the dataset and the bound given by the Johnson-Lindenstrauss lemma. In that case the quality of the embedding is controlled by the ``eps`` parameter. It should be noted that Johnson-Lindenstrauss lemma can yield very conservative estimated of the required number of components as it makes no assumption on the structure of the dataset.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
eps = hyperparams.Bounded[float](
default=0.1,
lower=0,
upper=1,
description='Parameter to control the quality of the embedding according to the Johnson-Lindenstrauss lemma when n_components is set to \'auto\'. Smaller values lead to better embedding and higher number of dimensions (n_components) in the target projection space.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
)
use_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.",
)
exclude_columns = hyperparams.Set(
elements=hyperparams.Hyperparameter[int](-1),
default=(),
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.",
)
return_result = hyperparams.Enumeration(
values=['append', 'replace', 'new'],
default='new',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
)
use_semantic_types = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe"
)
add_index_columns = hyperparams.UniformBool(
default=False,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
)
error_on_no_input = hyperparams.UniformBool(
default=True,
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
)
return_semantic_type = hyperparams.Enumeration[str](
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
default='https://metadata.datadrivendiscovery.org/types/Attribute',
description='Decides what semantic type to attach to generated attributes',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
)

class SKGaussianRandomProjection(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
"""
Primitive wrapping for sklearn GaussianRandomProjection
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.GaussianRandomProjection.html>`_
"""
__author__ = "JPL MARVIN"
metadata = metadata_base.PrimitiveMetadata({
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_PROJECTION, ],
"name": "sklearn.random_projection.GaussianRandomProjection",
"primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
"python_path": "d3m.primitives.data_transformation.gaussian_random_projection.SKlearn",
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.GaussianRandomProjection.html']},
"version": "2019.11.13",
"id": "fc933ab9-baaf-47ca-a373-bdd33081f5fa",
"hyperparams_to_tune": ['n_components'],
'installation': [
{'type': metadata_base.PrimitiveInstallationType.PIP,
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
git_commit=utils.current_git_commit(os.path.dirname(__file__)),
),
}]
})

def __init__(self, *,
hyperparams: Hyperparams,
random_seed: int = 0,
docker_containers: Dict[str, DockerContainer] = None) -> None:

super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
# Build the underlying scikit-learn estimator from the primitive's hyperparams.
self._clf = GaussianRandomProjection(
n_components=self.hyperparams['n_components'],
eps=self.hyperparams['eps'],
random_state=self.random_seed,
)
self._inputs = None
self._outputs = None
self._training_inputs = None
self._training_outputs = None
self._target_names = None
self._training_indices = None
self._target_column_indices = None
self._target_columns_metadata: List[OrderedDict] = None
self._input_column_names = None
self._fitted = False
def set_training_data(self, *, inputs: Inputs) -> None:
self._inputs = inputs
self._fitted = False
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
if self._fitted:
return CallResult(None)

self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
self._input_column_names = self._training_inputs.columns

if self._training_inputs is None:
return CallResult(None)

if len(self._training_indices) > 0:
self._clf.fit(self._training_inputs)
self._fitted = True
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
return CallResult(None)
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
if not self._fitted:
raise PrimitiveNotFittedError("Primitive not fitted.")
sk_inputs = inputs
if self.hyperparams['use_semantic_types']:
sk_inputs = inputs.iloc[:, self._training_indices]
output_columns = []
if len(self._training_indices) > 0:
sk_output = self._clf.transform(sk_inputs)
if sparse.issparse(sk_output):
sk_output = sk_output.toarray()
outputs = self._wrap_predictions(inputs, sk_output)
if len(outputs.columns) == len(self._input_column_names):
outputs.columns = self._input_column_names
output_columns = [outputs]
else:
if self.hyperparams['error_on_no_input']:
raise RuntimeError("No input columns were selected")
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
add_index_columns=self.hyperparams['add_index_columns'],
inputs=inputs, column_indices=self._training_indices,
columns_list=output_columns)
return CallResult(outputs)

def get_params(self) -> Params:
if not self._fitted:
return Params(
n_components_=None,
components_=None,
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

return Params(
n_components_=getattr(self._clf, 'n_components_', None),
components_=getattr(self._clf, 'components_', None),
input_column_names=self._input_column_names,
training_indices_=self._training_indices,
target_names_=self._target_names,
target_column_indices_=self._target_column_indices,
target_columns_metadata_=self._target_columns_metadata
)

def set_params(self, *, params: Params) -> None:
self._clf.n_components_ = params['n_components_']
self._clf.components_ = params['components_']
self._input_column_names = params['input_column_names']
self._training_indices = params['training_indices_']
self._target_names = params['target_names_']
self._target_column_indices = params['target_column_indices_']
self._target_columns_metadata = params['target_columns_metadata_']
if params['n_components_'] is not None:
self._fitted = True
if params['components_'] is not None:
self._fitted = True



@classmethod
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
if not hyperparams['use_semantic_types']:
return inputs, list(range(len(inputs.columns)))

inputs_metadata = inputs.metadata

def can_produce_column(column_index: int) -> bool:
return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
use_columns=hyperparams['use_columns'],
exclude_columns=hyperparams['exclude_columns'],
can_use_column=can_produce_column)
return inputs.iloc[:, columns_to_produce], columns_to_produce
# return columns_to_produce

@classmethod
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

accepted_structural_types = (int, float, numpy.integer, numpy.float64)
accepted_semantic_types = set()
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")
if not issubclass(column_metadata['structural_type'], accepted_structural_types):
return False

semantic_types = set(column_metadata.get('semantic_types', []))

if len(semantic_types) == 0:
cls.logger.warning("No semantic types found in column metadata")
return False
# Making sure all accepted_semantic_types are available in semantic_types
if len(accepted_semantic_types - semantic_types) == 0:
return True

return False

@classmethod
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

# Update semantic types and prepare it for predicted targets.
semantic_types = set(column_metadata.get('semantic_types', []))
semantic_types_to_remove = set()
add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"])
semantic_types = semantic_types - semantic_types_to_remove
semantic_types = semantic_types.union(add_semantic_types)
column_metadata['semantic_types'] = list(semantic_types)

target_columns_metadata.append(column_metadata)

return target_columns_metadata
@classmethod
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

for column_index, column_metadata in enumerate(target_columns_metadata):
column_metadata.pop("structural_type", None)
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

return outputs_metadata

def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
outputs = d3m_dataframe(predictions, generate_metadata=True)
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
return outputs


@classmethod
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):

outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']
target_columns_metadata: List[OrderedDict] = []
for column_index in range(outputs_length):
column_name = "output_{}".format(column_index)
column_metadata = OrderedDict()
semantic_types = set()
semantic_types.add(hyperparams["return_semantic_type"])
column_metadata['semantic_types'] = list(semantic_types)

column_metadata["name"] = str(column_name)
target_columns_metadata.append(column_metadata)

return target_columns_metadata


SKGaussianRandomProjection.__doc__ = GaussianRandomProjection.__doc__
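
# --- Usage sketch (editor's addition, not part of the original module) ---
# Unlike the two supervised primitives above, this transformer implements the
# unsupervised-learner interface: set_training_data takes only 'inputs', and
# produce returns the projected columns. Overriding 'n_components' with an
# integer (instead of the default 'auto') is shown under the same assumed
# Hyperparams constructor pattern; "train_X" is a hypothetical DataFrame.
#
#   hp = Hyperparams(Hyperparams.defaults(), n_components=10)
#   grp = SKGaussianRandomProjection(hyperparams=hp)
#   grp.set_training_data(inputs=train_X)
#   grp.fit()
#   projected = grp.produce(inputs=train_X).value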

Some files were not shown because too many files changed in this diff
