@@ -1,363 +0,0 @@
## v0.8.0
* Removed multi-target support in `classification.light_gbm.Common` and fixed
  categorical attributes handling.
  [!118](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/118)
* Unified date parsing across primitives.
  Added a `raise_error` hyper-parameter to `data_preprocessing.datetime_range_filter.Common`.
  This bumped the version of the primitive.
  [!117](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/117)
* `evaluation.kfold_time_series_split.Common` now parses the datetime column
  before sorting. A `fuzzy_time_parsing` hyper-parameter was added to the primitive.
  This bumped the version of the primitive.
  [!110](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/110)
* Added an `equal` option to the `match_logic` hyper-parameter of the
  `data_transformation.extract_columns_by_semantic_types.Common` primitive to support set equality
  when determining columns to extract. This bumped the version of the primitive.
  [!116](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/116)
* Fixed `data_preprocessing.one_hot_encoder.MakerCommon` to work with the
  latest core package.
* `data_cleaning.tabular_extractor.Common` has been fixed to work with the
  latest version of sklearn.
  [!113](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/113)
* The ISI side of `data_augmentation.datamart_augmentation.Common` and
  `data_augmentation.datamart_download.Common` has been updated.
  [!108](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/108)
* Improved how pipelines and pipeline runs for all primitives are managed.
  Many more pipelines and pipeline runs were added.
* `evaluation.kfold_timeseries_split.Common` has been renamed to `evaluation.kfold_time_series_split.Common`.
* Fixed `data_preprocessing.dataset_sample.Common` on empty input.
  [!95](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/95)
* `data_preprocessing.datetime_range_filter.Common` no longer assumes the local timezone
  when parsing dates.
  [#115](https://gitlab.com/datadrivendiscovery/common-primitives/issues/115)
* Added a `fuzzy_time_parsing` hyper-parameter to `data_transformation.column_parser.Common`.
  This bumped the version of the primitive.
* Fixed `data_transformation.column_parser.Common` to work correctly with `python-dateutil==2.8.1`.
  [#119](https://gitlab.com/datadrivendiscovery/common-primitives/issues/119)
* Refactored `data_preprocessing.one_hot_encoder.MakerCommon` to address some issues.
  [#66](https://gitlab.com/datadrivendiscovery/common-primitives/issues/66)
  [#75](https://gitlab.com/datadrivendiscovery/common-primitives/issues/75)
  [!96](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/96)
* Added support for handling numeric columns to `data_preprocessing.regex_filter.Common` and `data_preprocessing.term_filter.Common`.
  [!101](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/101)
  [!104](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/104)
* Fixed an exception in the `produce` method of `data_transformation.datetime_field_compose.Common` caused by using an incorrect type for the DataFrame indexer.
  [!102](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/102)
* Added primitives:
    * `data_transformation.grouping_field_compose.Common`
## v0.7.0
* Renamed primitives:
    * `data_transformation.add_semantic_types.DataFrameCommon` to `data_transformation.add_semantic_types.Common`
    * `data_transformation.remove_semantic_types.DataFrameCommon` to `data_transformation.remove_semantic_types.Common`
    * `data_transformation.replace_semantic_types.DataFrameCommon` to `data_transformation.replace_semantic_types.Common`
    * `operator.column_map.DataFrameCommon` to `operator.column_map.Common`
    * `regression.xgboost_gbtree.DataFrameCommon` to `regression.xgboost_gbtree.Common`
    * `classification.light_gbm.DataFrameCommon` to `classification.light_gbm.Common`
    * `classification.xgboost_gbtree.DataFrameCommon` to `classification.xgboost_gbtree.Common`
    * `classification.xgboost_dart.DataFrameCommon` to `classification.xgboost_dart.Common`
    * `classification.random_forest.DataFrameCommon` to `classification.random_forest.Common`
    * `data_transformation.extract_columns.DataFrameCommon` to `data_transformation.extract_columns.Common`
    * `data_transformation.extract_columns_by_semantic_types.DataFrameCommon` to `data_transformation.extract_columns_by_semantic_types.Common`
    * `data_transformation.extract_columns_by_structural_types.DataFrameCommon` to `data_transformation.extract_columns_by_structural_types.Common`
    * `data_transformation.cut_audio.DataFrameCommon` to `data_transformation.cut_audio.Common`
    * `data_transformation.column_parser.DataFrameCommon` to `data_transformation.column_parser.Common`
    * `data_transformation.remove_columns.DataFrameCommon` to `data_transformation.remove_columns.Common`
    * `data_transformation.remove_duplicate_columns.DataFrameCommon` to `data_transformation.remove_duplicate_columns.Common`
    * `data_transformation.horizontal_concat.DataFrameConcat` to `data_transformation.horizontal_concat.DataFrameCommon`
    * `data_transformation.construct_predictions.DataFrameCommon` to `data_transformation.construct_predictions.Common`
    * `data_transformation.datetime_field_compose.DataFrameCommon` to `data_transformation.datetime_field_compose.Common`
    * `data_preprocessing.label_encoder.DataFrameCommon` to `data_preprocessing.label_encoder.Common`
    * `data_preprocessing.label_decoder.DataFrameCommon` to `data_preprocessing.label_decoder.Common`
    * `data_preprocessing.image_reader.DataFrameCommon` to `data_preprocessing.image_reader.Common`
    * `data_preprocessing.text_reader.DataFrameCommon` to `data_preprocessing.text_reader.Common`
    * `data_preprocessing.video_reader.DataFrameCommon` to `data_preprocessing.video_reader.Common`
    * `data_preprocessing.csv_reader.DataFrameCommon` to `data_preprocessing.csv_reader.Common`
    * `data_preprocessing.audio_reader.DataFrameCommon` to `data_preprocessing.audio_reader.Common`
    * `data_preprocessing.regex_filter.DataFrameCommon` to `data_preprocessing.regex_filter.Common`
    * `data_preprocessing.term_filter.DataFrameCommon` to `data_preprocessing.term_filter.Common`
    * `data_preprocessing.numeric_range_filter.DataFrameCommon` to `data_preprocessing.numeric_range_filter.Common`
    * `data_preprocessing.datetime_range_filter.DataFrameCommon` to `data_preprocessing.datetime_range_filter.Common`
## v0.6.0
* Added `match_logic`, `negate`, and `add_index_columns` hyper-parameters
  to the `data_transformation.extract_columns_by_structural_types.DataFrameCommon`
  and `data_transformation.extract_columns_by_semantic_types.DataFrameCommon`
  primitives.
* `feature_extraction.sparse_pca.Common` has been removed and is now available as part of realML.
  [!89](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/89)
* Added new primitives:
    * `data_preprocessing.datetime_range_filter.DataFrameCommon`
    * `data_transformation.datetime_field_compose.DataFrameCommon`
    * `d3m.primitives.data_preprocessing.flatten.DataFrameCommon`
    * `data_augmentation.datamart_augmentation.Common`
    * `data_augmentation.datamart_download.Common`
    * `data_preprocessing.dataset_sample.Common`
  [#53](https://gitlab.com/datadrivendiscovery/common-primitives/issues/53)
  [!86](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/86)
  [!87](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/87)
  [!85](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/85)
  [!63](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/63)
  [!92](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/92)
  [!93](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/93)
  [!81](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/81)
* Fixed the `fit` method to return the correct value for `operator.column_map.DataFrameCommon`,
  `operator.dataset_map.DataFrameCommon`, and `schema_discovery.profiler.Common`.
* Some unmaintained primitives have been disabled. If you are using them, consider adopting them.
    * `classification.bayesian_logistic_regression.Common`
    * `regression.convolutional_neural_net.TorchCommon`
    * `operator.diagonal_mvn.Common`
    * `regression.feed_forward_neural_net.TorchCommon`
    * `data_preprocessing.image_reader.Common`
    * `clustering.k_means.Common`
    * `regression.linear_regression.Common`
    * `regression.loss.TorchCommon`
    * `feature_extraction.pca.Common`
* `data_transformation.update_semantic_types.DatasetCommon` has been removed.
  Use `data_transformation.add_semantic_types.DataFrameCommon`,
  `data_transformation.remove_semantic_types.DataFrameCommon`,
  or `data_transformation.replace_semantic_types.DataFrameCommon` together with
  the `operator.dataset_map.DataFrameCommon` primitive to obtain the previous functionality.
  [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
* `data_transformation.remove_columns.DatasetCommon` has been removed.
  Use `data_transformation.remove_columns.DataFrameCommon` together with
  the `operator.dataset_map.DataFrameCommon` primitive to obtain the previous functionality.
  [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
* Some primitives which operate on a Dataset have been converted to operate
  on a DataFrame and renamed. Use them together with the `operator.dataset_map.DataFrameCommon`
  primitive to obtain the previous functionality.
    * `data_preprocessing.regex_filter.DatasetCommon` to `data_preprocessing.regex_filter.DataFrameCommon`
    * `data_preprocessing.term_filter.DatasetCommon` to `data_preprocessing.term_filter.DataFrameCommon`
    * `data_preprocessing.numeric_range_filter.DatasetCommon` to `data_preprocessing.numeric_range_filter.DataFrameCommon`
  [#83](https://gitlab.com/datadrivendiscovery/common-primitives/issues/83)
  [!84](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/84)
* `schema_discovery.profiler.Common` has been improved:
    * More options added to `detect_semantic_types`.
    * Added a new `remove_unknown_type` hyper-parameter.
## v0.5.0
* The `evaluation.compute_scores.Common` primitive has been moved to the core
  package and renamed to `evaluation.compute_scores.Core`.
* `metafeature_extraction.compute_metafeatures.Common` has been renamed to
  `metalearning.metafeature_extractor.Common`.
* `evaluation.compute_scores.Common` now has an `add_normalized_scores` hyper-parameter
  which controls whether a column with normalized scores is also added to the output;
  it is now added by default.
* The `data_preprocessing.text_reader.DataFrameCommon` primitive has been fixed.
* The `data_transformation.rename_duplicate_name.DataFrameCommon` primitive was
  fixed to handle all types of column names.
  [#73](https://gitlab.com/datadrivendiscovery/common-primitives/issues/73)
  [!65](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/65)
* Added new primitives:
    * `data_cleaning.tabular_extractor.Common`
    * `data_preprocessing.one_hot_encoder.PandasCommon`
    * `schema_discovery.profiler.Common`
    * `data_transformation.ravel.DataFrameRowCommon`
    * `operator.column_map.DataFrameCommon`
    * `operator.dataset_map.DataFrameCommon`
    * `data_transformation.normalize_column_references.Common`
    * `data_transformation.normalize_graphs.Common`
    * `feature_extraction.sparse_pca.Common`
    * `evaluation.kfold_timeseries_split.Common`
  [#57](https://gitlab.com/datadrivendiscovery/common-primitives/issues/57)
  [!42](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/42)
  [!44](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/44)
  [!47](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/47)
  [!71](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/71)
  [!73](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/73)
  [!77](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/77)
  [!66](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/66)
  [!67](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/67)
* Added a hyper-parameter `error_on_no_columns` to `classification.random_forest.DataFrameCommon`.
* Common primitives have been updated to the latest changes in the d3m core package.
* Many utility functions from `utils.py` have been moved to the d3m core package.
## v0.4.0
* Renamed `data_preprocessing.one_hot_encoder.Common` to
  `data_preprocessing.one_hot_encoder.MakerCommon` and reimplemented it.
  [!54](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/54)
* Added new primitives:
    * `classification.xgboost_gbtree.DataFrameCommon`
    * `classification.xgboost_dart.DataFrameCommon`
    * `regression.xgboost_gbtree.DataFrameCommon`
    * `classification.light_gbm.DataFrameCommon`
    * `data_transformation.rename_duplicate_name.DataFrameCommon`
  [!45](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/45)
  [!46](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/46)
  [!49](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/49)
* Made sure `utils.select_columns` also works when given a tuple of columns instead of a list.
  [!58](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/58)
* `classification.random_forest.DataFrameCommon` was updated so that produced columns have
  names matching the column names seen during fitting. Moreover, `produce_feature_importances`
  returns a `DataFrame` with one column per feature and a single row with
  importances.
  [!59](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/59)
* `regression.feed_forward_neural_net.TorchCommon` was updated to support
  selection of columns using semantic types.
  [!57](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/57)
## v0.3.0
* Made the `evaluation.redact_columns.Common` primitive more general so that it can
  redact any columns based on their semantic type, not just targets.
* Renamed primitives:
    * `datasets.Denormalize` to `data_transformation.denormalize.Common`
    * `datasets.DatasetToDataFrame` to `data_transformation.dataset_to_dataframe.Common`
    * `evaluation.ComputeScores` to `evaluation.compute_scores.Common`
    * `evaluation.RedactTargets` to `evaluation.redact_columns.Common`
    * `evaluation.KFoldDatasetSplit` to `evaluation.kfold_dataset_split.Common`
    * `evaluation.TrainScoreDatasetSplit` to `evaluation.train_score_dataset_split.Common`
    * `evaluation.NoSplitDatasetSplit` to `evaluation.no_split_dataset_split.Common`
    * `evaluation.FixedSplitDatasetSplit` to `evaluation.fixed_split_dataset_split.Commmon`
    * `classifier.RandomForest` to `classification.random_forest.DataFrameCommon`
    * `metadata.ComputeMetafeatures` to `metafeature_extraction.compute_metafeatures.Common`
    * `audio.CutAudio` to `data_transformation.cut_audio.DataFrameCommon`
    * `data.ListToNDArray` to `data_transformation.list_to_ndarray.Common`
    * `data.StackNDArrayColumn` to `data_transformation.stack_ndarray_column.Common`
    * `data.AddSemanticTypes` to `data_transformation.add_semantic_types.DataFrameCommon`
    * `data.RemoveSemanticTypes` to `data_transformation.remove_semantic_types.DataFrameCommon`
    * `data.ConstructPredictions` to `data_transformation.construct_predictions.DataFrameCommon`
    * `data.ColumnParser` to `data_transformation.column_parser.DataFrameCommon`
    * `data.CastToType` to `data_transformation.cast_to_type.Common`
    * `data.ExtractColumns` to `data_transformation.extract_columns.DataFrameCommon`
    * `data.ExtractColumnsBySemanticTypes` to `data_transformation.extract_columns_by_semantic_types.DataFrameCommon`
    * `data.ExtractColumnsByStructuralTypes` to `data_transformation.extract_columns_by_structural_types.DataFrameCommon`
    * `data.RemoveColumns` to `data_transformation.remove_columns.DataFrameCommon`
    * `data.RemoveDuplicateColumns` to `data_transformation.remove_duplicate_columns.DataFrameCommon`
    * `data.HorizontalConcat` to `data_transformation.horizontal_concat.DataFrameConcat`
    * `data.DataFrameToNDArray` to `data_transformation.dataframe_to_ndarray.Common`
    * `data.NDArrayToDataFrame` to `data_transformation.ndarray_to_dataframe.Common`
    * `data.DataFrameToList` to `data_transformation.dataframe_to_list.Common`
    * `data.ListToDataFrame` to `data_transformation.list_to_dataframe.Common`
    * `data.NDArrayToList` to `data_transformation.ndarray_to_list.Common`
    * `data.ReplaceSemanticTypes` to `data_transformation.replace_semantic_types.DataFrameCommon`
    * `data.UnseenLabelEncoder` to `data_preprocessing.label_encoder.DataFrameCommon`
    * `data.UnseenLabelDecoder` to `data_preprocessing.label_decoder.DataFrameCommon`
    * `data.ImageReader` to `data_preprocessing.image_reader.DataFrameCommon`
    * `data.TextReader` to `data_preprocessing.text_reader.DataFrameCommon`
    * `data.VideoReader` to `data_preprocessing.video_reader.DataFrameCommon`
    * `data.CSVReader` to `data_preprocessing.csv_reader.DataFrameCommon`
    * `data.AudioReader` to `data_preprocessing.audio_reader.DataFrameCommon`
    * `datasets.UpdateSemanticTypes` to `data_transformation.update_semantic_types.DatasetCommon`
    * `datasets.RemoveColumns` to `data_transformation.remove_columns.DatasetCommon`
    * `datasets.RegexFilter` to `data_preprocessing.regex_filter.DatasetCommon`
    * `datasets.TermFilter` to `data_preprocessing.term_filter.DatasetCommon`
    * `datasets.NumericRangeFilter` to `data_preprocessing.numeric_range_filter.DatasetCommon`
    * `common_primitives.BayesianLogisticRegression` to `classification.bayesian_logistic_regression.Common`
    * `common_primitives.ConvolutionalNeuralNet` to `regression.convolutional_neural_net.TorchCommon`
    * `common_primitives.DiagonalMVN` to `operator.diagonal_mvn.Common`
    * `common_primitives.FeedForwardNeuralNet` to `regression.feed_forward_neural_net.TorchCommon`
    * `common_primitives.ImageReader` to `data_preprocessing.image_reader.Common`
    * `common_primitives.KMeans` to `clustering.kmeans.Common`
    * `common_primitives.LinearRegression` to `regression.linear_regression.Common`
    * `common_primitives.Loss` to `regression.loss.TorchCommon`
    * `common_primitives.PCA` to `feature_extraction.pca.Common`
    * `common_primitives.OneHotMaker` to `data_preprocessing.one_hot_encoder.Common`
* Fixed a pickling issue of `classifier.RandomForest`.
  [#47](https://gitlab.com/datadrivendiscovery/common-primitives/issues/47)
  [!48](https://gitlab.com/datadrivendiscovery/common-primitives/merge_requests/48)
* The `data.ColumnParser` primitive now has an additional hyper-parameter, `replace_index_columns`,
  which controls whether index columns are still replaced when returned
  parsed columns are otherwise appended.
* Made `data.RemoveDuplicateColumns` fit and remember duplicate columns during training.
  [#45](https://gitlab.com/datadrivendiscovery/common-primitives/issues/45)
* Added a `match_logic` hyper-parameter to the `data.ReplaceSemanticTypes` primitive
  which allows one to control how multiple specified semantic types are matched.
* Added new primitives:
    * `metadata.ComputeMetafeatures`
    * `datasets.RegexFilter`
    * `datasets.TermFilter`
    * `datasets.NumericRangeFilter`
    * `evaluation.NoSplitDatasetSplit`
    * `evaluation.FixedSplitDatasetSplit`
* The column parser was fixed to parse columns with the `http://schema.org/DateTime` semantic type.
* Simplified the logic (and made it more predictable) of the `combine_columns` utility function when
  using the `new` `return_result` with `add_index_columns` set to true. Now, if the output already contains
  any index column, input index columns are not added; if there are no index columns,
  all input index columns are added at the beginning.
* Fixed `_can_use_inputs_column` in `classifier.RandomForest`. Added a check of the structural type, so
  that only columns with numerical structural types are processed.
* Correctly set column names in the `evaluation.ComputeScores` primitive's output.
* Cast indices and columns to match predicted columns' dtypes.
  [#33](https://gitlab.com/datadrivendiscovery/common-primitives/issues/33)
* The `datasets.DatasetToDataFrame` primitive no longer tries to generate metadata automatically
  because this is not really needed (metadata can just be copied from the dataset). This
  speeds up the primitive.
  [#34](https://gitlab.com/datadrivendiscovery/common-primitives/issues/34)
* Made it uniform that whenever we generate lists of all column names,
  we first try to get the name from the metadata and fall back to the one in the DataFrame,
  instead of using a column index in the latter case.
* Made the splitting primitives, `classifier.RandomForest`, and `data.UnseenLabelEncoder`
  picklable even when unfitted.
* Fixed the entry point for the `audio.CutAudio` primitive.
## v0.2.0
* Made the primitives listed below operate on semantic types and support different ways of returning results.
* Added or updated many primitives:
    * `data.ExtractColumns`
    * `data.ExtractColumnsBySemanticTypes`
    * `data.ExtractColumnsByStructuralTypes`
    * `data.RemoveColumns`
    * `data.RemoveDuplicateColumns`
    * `data.HorizontalConcat`
    * `data.CastToType`
    * `data.ColumnParser`
    * `data.ConstructPredictions`
    * `data.DataFrameToNDArray`
    * `data.NDArrayToDataFrame`
    * `data.DataFrameToList`
    * `data.ListToDataFrame`
    * `data.NDArrayToList`
    * `data.ListToNDArray`
    * `data.StackNDArrayColumn`
    * `data.AddSemanticTypes`
    * `data.RemoveSemanticTypes`
    * `data.ReplaceSemanticTypes`
    * `data.UnseenLabelEncoder`
    * `data.UnseenLabelDecoder`
    * `data.ImageReader`
    * `data.TextReader`
    * `data.VideoReader`
    * `data.CSVReader`
    * `data.AudioReader`
    * `datasets.Denormalize`
    * `datasets.DatasetToDataFrame`
    * `datasets.UpdateSemanticTypes`
    * `datasets.RemoveColumns`
    * `evaluation.RedactTargets`
    * `evaluation.ComputeScores`
    * `evaluation.KFoldDatasetSplit`
    * `evaluation.TrainScoreDatasetSplit`
    * `audio.CutAudio`
    * `classifier.RandomForest`
* Started listing enabled primitives in the [`entry_points.ini`](./entry_points.ini) file.
* Created a `devel` branch which contains primitives coded against the
  future release of the `d3m` core package (its `devel` branch).
  The `master` branch of this repository is made against the latest stable
  release of the `d3m` core package.
* Dropped support for Python 2.7; Python 3.6 is now required.
* Renamed the repository and package to `common-primitives` and `common_primitives`,
  respectively.
* Repository migrated to gitlab.com and made public.
## v0.1.1
* Made common primitives work on Python 2.7.
## v0.1.0
* Initial set of common primitives.
@@ -1,94 +0,0 @@
# How to publish primitive annotations
As contributors add or update their primitives, they might want to publish
primitive annotations for the added primitives. When doing this, it is important
to also republish all other primitive annotations already published from this
package. This is because only one version of the package can be installed at
a time and all primitive annotations have to point to the same package in
their `installation` metadata.
Steps to publish primitive annotations:
* Operate in a virtual env with the following installed:
    * The target core package.
    * [Test primitives](https://gitlab.com/datadrivendiscovery/tests-data/tree/master/primitives)
      with the same versions of primitives which are currently published in the `primitives`
      repository. Remember to install them in `-e` editable mode.
* Update `HISTORY.md` for the `vNEXT` release with information about primitives
  added or updated. If there has been no package release since they were last updated,
  do not duplicate entries but just update any existing entries for those primitives
  instead, so that once released it is clear what has changed in the release as a whole.
* Make sure tests for the primitives being published (primitives added, updated,
  and primitives previously published which should now be republished) pass.
* Update `entry_points.ini` and add new primitives. Leave active
  only those entries for primitives being (re)published and comment out all others.
    * If this is the first time primitives are published after a release of a new `d3m`
      core package, leave active only those which were updated to work with
      the new `d3m` core package. Leave it to others to update, verify, and publish
      the other common primitives.
* In a clone of the `primitives` repository, prepare a branch off the up-to-date `master` branch
  in which to add/update primitive annotations. If existing annotations for common primitives
  are already there, it is best to first remove them to make sure annotations for
  removed primitives do not stay around. All primitives are re-added in the next step.
* Run `add.sh` in the root of this package, which will add primitive annotations
  to `primitives` (see the sketch after this list). See instructions in the script for more information.
* Verify the changes in `primitives`, then add and commit the files to git.
* Publish the branch in `primitives` and make a merge request.
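A minimal shell sketch of the flow above, assuming the `primitives` repository is cloned
as a sibling `d3m-primitives` directory (the layout `add.sh` expects); the branch name,
commit message, and the annotations path are illustrative only:
```
$ cd ../d3m-primitives
$ git checkout master && git pull
$ git checkout -b update-common-primitives
$ # optionally remove existing common primitives annotations first, so that
$ # annotations for removed primitives do not stay around (re-added below)
$ cd ../common-primitives
$ ./add.sh
$ cd ../d3m-primitives
$ git add . && git commit -m "Updated common primitives."
$ git push -u origin update-common-primitives
$ # then open a merge request from this branch
```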
# How to release a new version
A new version is always released from the `master` branch against a stable release
of the `d3m` core package. A new version should be released when there are major
changes to the package (many new primitives added, larger breaking changes).
Sync up with other developers of the repo to suggest a release, or to do a release.
* On the `master` branch:
    * Make sure the `HISTORY.md` file is updated with all changes since the last release.
    * Change the version in `common_primitives/__init__.py` to the to-be-released version, without the `v` prefix.
    * Change `vNEXT` in `HISTORY.md` to the to-be-released version, with the `v` prefix.
    * Commit with message `Bumping version for release.`
    * `git push`
    * Wait for CI to run tests successfully.
    * Tag with the version prefixed with `v`, e.g., for version `0.2.0`: `git tag v0.2.0`
    * `git push` & `git push --tags`
    * Change the version in `common_primitives/__init__.py` back to the `devel` string.
    * Add a new empty `vNEXT` version on top of `HISTORY.md`.
    * Commit with message `Version bump for development.`
    * `git push`
* On the `devel` branch:
    * Merge `master` into the `devel` branch: `git merge master`
    * Update the branch according to the section below.
    * `git push`
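Condensed into commands, the release flow above looks roughly like this (a sketch;
`0.2.0` is just the example version used above):
```
$ git commit -am "Bumping version for release."
$ git push
$ # wait for CI to pass, then:
$ git tag v0.2.0
$ git push && git push --tags
$ # switch the version back to "devel" and add an empty vNEXT entry, then:
$ git commit -am "Version bump for development."
$ git push
$ git checkout devel
$ git merge master
$ git push
```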
# How to update `master` branch after a release of a new `d3m` core package
Hopefully, the `devel` branch already contains code which works against the released
`d3m` core package. So merge the `devel` branch into the `master` branch and update
files according to the following section.
# Keeping `master` and `devel` branches in sync
Because the `master` and `devel` branches mostly contain the same code,
just made against different versions of the `d3m` core package, it is common
to merge the branches into each other as needed to keep them in sync.
When doing so, the following files are specific to each branch:
* `.gitlab-ci.yml` has a `DEPENDENCY_REF` environment variable which
  has to point to `master` on the `master` branch of this repository,
  and to `devel` on the `devel` branch of this repository.
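For illustration, a minimal sketch of what this could look like in `.gitlab-ci.yml`
on the `master` branch; only the `DEPENDENCY_REF` variable name comes from this
document, the surrounding structure is assumed:
```
variables:
  # Branch of dependencies to build against; must be "master" on the
  # master branch and "devel" on the devel branch.
  DEPENDENCY_REF: master
```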
# How to add an example pipeline
Every common primitive (except those used in non-standard pipelines, like splitting primitives)
should have at least one example pipeline and an associated pipeline run.
Add example pipelines into the corresponding sub-directory (based on the primitive's suffix) of the
`pipelines` directory in the repository. If a pipeline uses multiple common primitives, add it for only
one primitive and create symbolic links for the other primitives.
Create a `fit-score` pipeline run as [described in the primitives index repository](https://gitlab.com/datadrivendiscovery/primitives#adding-a-primitive).
Compress it with `gzip` and store it under the `pipeline_runs` directory in the repository.
Similarly, add it for only one primitive and create symbolic links for the others if the pipeline run
corresponds to a pipeline with multiple common primitives.
Use the `git-add.sh` script to ensure all files larger than 100 KB are added as git LFS files to
the repository.
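For illustration, a minimal sketch of these steps with hypothetical primitive suffixes
and file names:
```
$ # compress the pipeline run and store it under the primitive's directory
$ gzip pipeline_run.yml
$ mkdir -p pipeline_runs/data_transformation.column_parser.Common
$ mv pipeline_run.yml.gz pipeline_runs/data_transformation.column_parser.Common/
$ # if the pipeline also uses another common primitive, symlink instead of copying
$ mkdir -p pipeline_runs/data_transformation.dataset_to_dataframe.Common
$ cd pipeline_runs/data_transformation.dataset_to_dataframe.Common
$ ln -s ../data_transformation.column_parser.Common/pipeline_run.yml.gz .
$ cd ../../..
$ # track everything larger than 100 KB through git LFS
$ ./git-add.sh
```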
@@ -1,201 +0,0 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
@@ -1,2 +0,0 @@
include README.md
include LICENSE.txt
@@ -1,83 +0,0 @@
# Common D3M primitives
A common set of primitives for the D3M project, maintained together.
It contains example primitives, various glue primitives, and other primitives
contributed by performers.
## Installation
This package works on Python 3.6+ and pip 19+.
This package has additional dependencies which are specified in primitives' metadata,
but if you are installing the package manually, you have to first run, on Ubuntu:
```
$ apt-get install build-essential libopenblas-dev libcap-dev ffmpeg
$ pip3 install python-prctl
```
To install common primitives from inside a cloned repository, run:
```
$ pip3 install -e .
```
When cloning the repository, clone it recursively to also get the git submodules:
```
$ git clone --recursive https://gitlab.com/datadrivendiscovery/common-primitives.git
```
## Changelog
See [HISTORY.md](./HISTORY.md) for a summary of changes to this package.
## Repository structure
The `master` branch contains the latest code of common primitives made against the latest stable
release of the [`d3m` core package](https://gitlab.com/datadrivendiscovery/d3m) (its `master` branch).
The `devel` branch contains the latest code of common primitives made against the
future release of the `d3m` core package (its `devel` branch).
Releases are [tagged](https://gitlab.com/datadrivendiscovery/d3m/tags) but they are not done
regularly. Each primitive has its own version as well, which is not related to package versions.
Generally it is best to just use the latest code available in the `master` or `devel`
branch (depending on which version of the core package you are using).
## Testing locally
For each commit to this repository, tests run automatically in the
[GitLab CI](https://gitlab.com/datadrivendiscovery/common-primitives/pipelines).
If you don't want to wait for the GitLab CI test results, you can run the tests locally
by installing the [GitLab runner](https://docs.gitlab.com/runner/install/) on your system.
With the local GitLab runner, you can run the tests defined in the [.gitlab-ci.yml](.gitlab-ci.yml)
file of this repository, such as:
```
$ gitlab-runner exec docker style_check
$ gitlab-runner exec docker type_check
```
You can also just run the tests available under `/tests` by running:
```
$ python3 run_tests.py
```
## Contribute
Feel free to contribute more primitives to this repository. The idea is that we build
a common set of primitives which can serve both as examples and as a way to share
maintenance of some primitives, especially glue primitives.
All primitives are written in Python 3 and are type checked using
[mypy](http://www.mypy-lang.org/), so typing annotations are required.
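For example, a quick local type check can be run with mypy directly (a sketch; the
authoritative invocation is the `type_check` job in `.gitlab-ci.yml`):
```
$ mypy common_primitives
```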
## About Data Driven Discovery Program
The DARPA Data Driven Discovery (D3M) Program is researching ways to get machines to build
machine learning pipelines automatically. It is split into three layers:
TA1 (primitives), TA2 (systems which combine primitives automatically into pipelines
and execute them), and TA3 (end-user interfaces).
@@ -1,24 +0,0 @@
#!/bin/bash -e

# Assumption is that this repository is cloned into a "common-primitives" directory
# which is a sibling of the "d3m-primitives" directory with D3M public primitives.

D3M_VERSION="$(python3 -c 'import d3m; print(d3m.__version__)')"

for PRIMITIVE_SUFFIX in $(./list_primitives.py --suffix); do
  echo "$PRIMITIVE_SUFFIX"

  # Generate the primitive annotation and add it to the d3m-primitives clone.
  python3 -m d3m index describe -i 4 "d3m.primitives.$PRIMITIVE_SUFFIX" > primitive.json
  pushd ../d3m-primitives > /dev/null
  ./add.py ../common-primitives/primitive.json
  popd > /dev/null

  # Copy example pipelines for this primitive, if any exist.
  if [[ -e "pipelines/$PRIMITIVE_SUFFIX" ]]; then
    PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)"
    mkdir -p "$PRIMITIVE_PATH/pipelines"
    find "pipelines/$PRIMITIVE_SUFFIX/" \( -name '*.json' -or -name '*.yaml' -or -name '*.yml' -or -name '*.json.gz' -or -name '*.yaml.gz' -or -name '*.yml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipelines" ';'
  fi

  # Copy pipeline runs for this primitive, if any exist.
  if [[ -e "pipeline_runs/$PRIMITIVE_SUFFIX" ]]; then
    PRIMITIVE_PATH="$(echo ../d3m-primitives/v$D3M_VERSION/common-primitives/d3m.primitives.$PRIMITIVE_SUFFIX/*)"
    mkdir -p "$PRIMITIVE_PATH/pipeline_runs"
    find "pipeline_runs/$PRIMITIVE_SUFFIX/" \( -name '*.yml.gz' -or -name '*.yaml.gz' \) -exec cp '{}' "$PRIMITIVE_PATH/pipeline_runs" ';'
  fi
done
@@ -1,63 +0,0 @@
[d3m.primitives]
data_preprocessing.one_hot_encoder.MakerCommon = common_primitives.one_hot_maker:OneHotMakerPrimitive
data_preprocessing.one_hot_encoder.PandasCommon = common_primitives.pandas_onehot_encoder:PandasOneHotEncoderPrimitive
data_transformation.extract_columns.Common = common_primitives.extract_columns:ExtractColumnsPrimitive
data_transformation.extract_columns_by_semantic_types.Common = common_primitives.extract_columns_semantic_types:ExtractColumnsBySemanticTypesPrimitive
data_transformation.extract_columns_by_structural_types.Common = common_primitives.extract_columns_structural_types:ExtractColumnsByStructuralTypesPrimitive
data_transformation.remove_columns.Common = common_primitives.remove_columns:RemoveColumnsPrimitive
data_transformation.remove_duplicate_columns.Common = common_primitives.remove_duplicate_columns:RemoveDuplicateColumnsPrimitive
data_transformation.horizontal_concat.DataFrameCommon = common_primitives.horizontal_concat:HorizontalConcatPrimitive
data_transformation.cast_to_type.Common = common_primitives.cast_to_type:CastToTypePrimitive
data_transformation.column_parser.Common = common_primitives.column_parser:ColumnParserPrimitive
data_transformation.construct_predictions.Common = common_primitives.construct_predictions:ConstructPredictionsPrimitive
data_transformation.dataframe_to_ndarray.Common = common_primitives.dataframe_to_ndarray:DataFrameToNDArrayPrimitive
data_transformation.ndarray_to_dataframe.Common = common_primitives.ndarray_to_dataframe:NDArrayToDataFramePrimitive
data_transformation.dataframe_to_list.Common = common_primitives.dataframe_to_list:DataFrameToListPrimitive
data_transformation.list_to_dataframe.Common = common_primitives.list_to_dataframe:ListToDataFramePrimitive
data_transformation.ndarray_to_list.Common = common_primitives.ndarray_to_list:NDArrayToListPrimitive
data_transformation.list_to_ndarray.Common = common_primitives.list_to_ndarray:ListToNDArrayPrimitive
data_transformation.stack_ndarray_column.Common = common_primitives.stack_ndarray_column:StackNDArrayColumnPrimitive
data_transformation.add_semantic_types.Common = common_primitives.add_semantic_types:AddSemanticTypesPrimitive
data_transformation.remove_semantic_types.Common = common_primitives.remove_semantic_types:RemoveSemanticTypesPrimitive
data_transformation.replace_semantic_types.Common = common_primitives.replace_semantic_types:ReplaceSemanticTypesPrimitive
data_transformation.denormalize.Common = common_primitives.denormalize:DenormalizePrimitive
data_transformation.datetime_field_compose.Common = common_primitives.datetime_field_compose:DatetimeFieldComposePrimitive
data_transformation.grouping_field_compose.Common = common_primitives.grouping_field_compose:GroupingFieldComposePrimitive
data_transformation.dataset_to_dataframe.Common = common_primitives.dataset_to_dataframe:DatasetToDataFramePrimitive
data_transformation.cut_audio.Common = common_primitives.cut_audio:CutAudioPrimitive
data_transformation.rename_duplicate_name.DataFrameCommon = common_primitives.rename_duplicate_columns:RenameDuplicateColumnsPrimitive
#data_transformation.normalize_column_references.Common = common_primitives.normalize_column_references:NormalizeColumnReferencesPrimitive
#data_transformation.normalize_graphs.Common = common_primitives.normalize_graphs:NormalizeGraphsPrimitive
data_transformation.ravel.DataFrameRowCommon = common_primitives.ravel:RavelAsRowPrimitive
data_preprocessing.label_encoder.Common = common_primitives.unseen_label_encoder:UnseenLabelEncoderPrimitive
data_preprocessing.label_decoder.Common = common_primitives.unseen_label_decoder:UnseenLabelDecoderPrimitive
data_preprocessing.image_reader.Common = common_primitives.dataframe_image_reader:DataFrameImageReaderPrimitive
data_preprocessing.text_reader.Common = common_primitives.text_reader:TextReaderPrimitive
data_preprocessing.video_reader.Common = common_primitives.video_reader:VideoReaderPrimitive
data_preprocessing.csv_reader.Common = common_primitives.csv_reader:CSVReaderPrimitive
data_preprocessing.audio_reader.Common = common_primitives.audio_reader:AudioReaderPrimitive
data_preprocessing.regex_filter.Common = common_primitives.regex_filter:RegexFilterPrimitive
data_preprocessing.term_filter.Common = common_primitives.term_filter:TermFilterPrimitive
data_preprocessing.numeric_range_filter.Common = common_primitives.numeric_range_filter:NumericRangeFilterPrimitive
data_preprocessing.datetime_range_filter.Common = common_primitives.datetime_range_filter:DatetimeRangeFilterPrimitive
data_preprocessing.dataset_sample.Common = common_primitives.dataset_sample:DatasetSamplePrimitive
#data_preprocessing.time_interval_transform.Common = common_primitives.time_interval_transform:TimeIntervalTransformPrimitive
data_cleaning.tabular_extractor.Common = common_primitives.tabular_extractor:AnnotatedTabularExtractorPrimitive
evaluation.redact_columns.Common = common_primitives.redact_columns:RedactColumnsPrimitive
evaluation.kfold_dataset_split.Common = common_primitives.kfold_split:KFoldDatasetSplitPrimitive
evaluation.kfold_time_series_split.Common = common_primitives.kfold_split_timeseries:KFoldTimeSeriesSplitPrimitive
evaluation.train_score_dataset_split.Common = common_primitives.train_score_split:TrainScoreDatasetSplitPrimitive
evaluation.no_split_dataset_split.Common = common_primitives.no_split:NoSplitDatasetSplitPrimitive
evaluation.fixed_split_dataset_split.Commmon = common_primitives.fixed_split:FixedSplitDatasetSplitPrimitive
classification.random_forest.Common = common_primitives.random_forest:RandomForestClassifierPrimitive
classification.light_gbm.Common = common_primitives.lgbm_classifier:LightGBMClassifierPrimitive
classification.xgboost_gbtree.Common = common_primitives.xgboost_gbtree:XGBoostGBTreeClassifierPrimitive
classification.xgboost_dart.Common = common_primitives.xgboost_dart:XGBoostDartClassifierPrimitive
regression.xgboost_gbtree.Common = common_primitives.xgboost_regressor:XGBoostGBTreeRegressorPrimitive
schema_discovery.profiler.Common = common_primitives.simple_profiler:SimpleProfilerPrimitive
operator.column_map.Common = common_primitives.column_map:DataFrameColumnMapPrimitive
operator.dataset_map.DataFrameCommon = common_primitives.dataset_map:DataFrameDatasetMapPrimitive
data_preprocessing.flatten.DataFrameCommon = common_primitives.dataframe_flatten:DataFrameFlattenPrimitive
metalearning.metafeature_extractor.Common = common_primitives.compute_metafeatures:ComputeMetafeaturesPrimitive
data_augmentation.datamart_augmentation.Common = common_primitives.datamart_augment:DataMartAugmentPrimitive
data_augmentation.datamart_download.Common = common_primitives.datamart_download:DataMartDownloadPrimitive
@@ -1,5 +0,0 @@
#!/bin/bash -e

# This requires git LFS 2.9.0 or newer.
# Track every file larger than 100 KB with git LFS before adding it.
find * -type f -size +100k -exec git lfs track --filename '{}' +
@@ -1,21 +0,0 @@ | |||
#!/bin/bash -e | |||
if git rev-list --objects --all \ | |||
| git cat-file --batch-check='%(objecttype) %(objectname) %(objectsize) %(rest)' \ | |||
| sed -n 's/^blob //p' \ | |||
| awk '$2 >= 100*(2^10)' \ | |||
| awk '{print $3}' \ | |||
| egrep -v '(^|/).gitattributes$' ; then | |||
echo "Repository contains committed objects larger than 100 KB." | |||
exit 1 | |||
fi | |||
if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 < 100*(2^10)' | awk '{print $2}' | grep . ; then | |||
  echo "Repository contains LFS objects smaller than 100 KB." | |||
  exit 1 | |||
fi | |||
if git lfs ls-files --name-only | xargs -r stat -c '%s %n' | awk '$1 >= 2*(2^30)' | awk '{print $2}' | grep . ; then | |||
  echo "Repository contains LFS objects of 2 GB or larger." | |||
  exit 1 | |||
fi |
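For illustration, a rough Python equivalent of the two LFS checks above, under the same size window (at least 100 KiB, strictly below 2 GiB); this is a sketch, not a replacement for the script:

import os
import subprocess

LOWER = 100 * 2**10  # 100 KiB: smaller files should not be in LFS
UPPER = 2 * 2**30    # 2 GiB: files this large cannot be committed at all

lfs_files = subprocess.run(
    ['git', 'lfs', 'ls-files', '--name-only'],
    capture_output=True, text=True, check=True,
).stdout.splitlines()

offenders = [name for name in lfs_files
             if not LOWER <= os.path.getsize(name) < UPPER]
if offenders:
    raise SystemExit('LFS objects outside the allowed size window: ' + ', '.join(offenders))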
@@ -1,32 +0,0 @@ | |||
#!/usr/bin/env python3 | |||
import argparse | |||
import configparser | |||
import re | |||
class CaseSensitiveConfigParser(configparser.ConfigParser): | |||
    # Preserve the case of option names; ConfigParser lowercases them by default. | |||
    optionxform = staticmethod(str) | |||
parser = argparse.ArgumentParser(description='List enabled common primitives.') | |||
group = parser.add_mutually_exclusive_group(required=True) | |||
group.add_argument('--suffix', action='store_true', help='list primitive suffixes of all enabled common primitives') | |||
group.add_argument('--python', action='store_true', help='list Python paths of all enabled common primitives') | |||
group.add_argument('--files', action='store_true', help='list file paths of all enabled common primitives') | |||
args = parser.parse_args() | |||
entry_points = CaseSensitiveConfigParser() | |||
entry_points.read('entry_points.ini') | |||
for primitive_suffix, primitive_path in entry_points.items('d3m.primitives'): | |||
    if args.python: | |||
        print("d3m.primitives.{primitive_suffix}".format(primitive_suffix=primitive_suffix)) | |||
    elif args.suffix: | |||
        print(primitive_suffix) | |||
    elif args.files: | |||
        # Drop the ':ClassName' suffix and convert the module path to a file path. | |||
        primitive_path = re.sub(':.+$', '', primitive_path) | |||
        primitive_path = re.sub(r'\.', '/', primitive_path) | |||
        print("{primitive_path}.py".format(primitive_path=primitive_path)) | |||
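The `CaseSensitiveConfigParser` subclass above matters because `configparser` lowercases option names by default, which would corrupt suffixes such as `data_preprocessing.dataset_sample.Common`. A small demonstration:

import configparser

ini = '[d3m.primitives]\ndata_preprocessing.dataset_sample.Common = x\n'

lowercasing = configparser.ConfigParser()
lowercasing.read_string(ini)
print(lowercasing.options('d3m.primitives'))  # ['data_preprocessing.dataset_sample.common']

class CaseSensitiveConfigParser(configparser.ConfigParser):
    optionxform = staticmethod(str)

preserving = CaseSensitiveConfigParser()
preserving.read_string(ini)
print(preserving.options('d3m.primitives'))  # ['data_preprocessing.dataset_sample.Common']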
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/1.yaml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/1.yaml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/1.yaml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/1.yaml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_preprocessing.one_hot_encoder.MakerCommon/1.yaml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/pipeline_run.yml.gz |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/pipeline_run.yml.gz |
@@ -1,246 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T01:09:44.343543Z", | |||
"id": "d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.7.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.3.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "259aa747-795c-435e-8e33-8c32a4c83c6b", | |||
"name": "LightGBM GBTree classifier", | |||
"python_path": "d3m.primitives.classification.light_gbm.Common", | |||
"version": "0.1.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
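This file is referenced elsewhere in the repository as `classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json`. A short sketch of loading it programmatically, assuming the d3m core package's `Pipeline.from_json` accepts an open file and that the referenced primitives are installed:

from d3m.metadata.pipeline import Pipeline

with open('d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json') as pipeline_file:
    loaded = Pipeline.from_json(pipeline_file)

print(loaded.id)          # d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde
print(len(loaded.steps))  # 8 steps, ending with construct_predictions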
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1,110 +0,0 @@ | |||
id: ccad0f9c-130e-4063-a91e-ea65a18cb041 | |||
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json | |||
source: | |||
  name: Mitar | |||
created: "2019-06-05T11:48:52.806069Z" | |||
context: TESTING | |||
name: Random Forest classifier pipeline | |||
description: | | |||
  A simple pipeline which runs a Random Forest classifier on tabular data. | |||
inputs: | |||
  - name: input dataset | |||
outputs: | |||
  - name: predictions | |||
    data: steps.5.produce | |||
steps: | |||
  # Step 0. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e | |||
      version: 0.2.0 | |||
      python_path: d3m.primitives.data_transformation.denormalize.Common | |||
      name: Denormalize datasets | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: inputs.0 | |||
    outputs: | |||
      - id: produce | |||
  # Step 1. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 | |||
      version: 0.3.0 | |||
      python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common | |||
      name: Extract a DataFrame from a Dataset | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.0.produce | |||
    outputs: | |||
      - id: produce | |||
  # Step 2. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: d510cb7a-1782-4f51-b44c-58f0236e47c7 | |||
      version: 0.6.0 | |||
      python_path: d3m.primitives.data_transformation.column_parser.Common | |||
      name: Parses strings into their types | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.1.produce | |||
    outputs: | |||
      - id: produce | |||
  # Step 3. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: d016df89-de62-3c53-87ed-c06bb6a23cde | |||
      version: 2019.6.7 | |||
      python_path: d3m.primitives.data_cleaning.imputer.SKlearn | |||
      name: sklearn.impute.SimpleImputer | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.2.produce | |||
    outputs: | |||
      - id: produce | |||
    hyperparams: | |||
      use_semantic_types: | |||
        type: VALUE | |||
        data: true | |||
      return_result: | |||
        type: VALUE | |||
        data: replace | |||
  # Step 4. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af | |||
      version: 0.4.0 | |||
      python_path: d3m.primitives.classification.random_forest.Common | |||
      name: Random forest classifier | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.3.produce | |||
      outputs: | |||
        type: CONTAINER | |||
        data: steps.3.produce | |||
    outputs: | |||
      - id: produce | |||
    hyperparams: | |||
      return_result: | |||
        type: VALUE | |||
        data: replace | |||
  # Step 5. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 8d38b340-f83f-4877-baaa-162f8e551736 | |||
      version: 0.3.0 | |||
      python_path: d3m.primitives.data_transformation.construct_predictions.Common | |||
      name: Construct pipeline predictions output | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.4.produce | |||
      reference: | |||
        type: CONTAINER | |||
        data: steps.2.produce | |||
    outputs: | |||
      - id: produce |
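The YAML form can be loaded the same way, assuming `Pipeline.from_yaml` mirrors `from_json`; the filename here is hypothetical:

from d3m.metadata.pipeline import Pipeline

with open('ccad0f9c-130e-4063-a91e-ea65a18cb041.yml') as pipeline_file:  # hypothetical filename
    loaded = Pipeline.from_yaml(pipeline_file)

print(loaded.name)  # Random Forest classifier pipeline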
@@ -1,246 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T01:33:29.921236Z", | |||
"id": "b7a24816-2518-4073-9c45-b97f2b2fee30", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.7.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.3.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "7476950e-4373-4cf5-a852-7e16afb8e098", | |||
"name": "XGBoost DART classifier", | |||
"python_path": "d3m.primitives.classification.xgboost_dart.Common", | |||
"version": "0.1.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
@@ -1,246 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T01:18:47.753202Z", | |||
"id": "4d402450-2562-48cc-93fd-719fb658c43c", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.7.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.3.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "fe0841b7-6e70-4bc3-a56c-0670a95ebc6a", | |||
"name": "XGBoost GBTree classifier", | |||
"python_path": "d3m.primitives.classification.xgboost_gbtree.Common", | |||
"version": "0.1.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
@@ -1,342 +0,0 @@ | |||
{ | |||
"id": "4ff2f21d-1bba-4c44-bb96-e05728bcf6ed", | |||
"name": "classification_template(imputer=d3m.primitives.data_cleaning.imputer.SKlearn, classifier=d3m.primitives.regression.random_forest.SKlearn)", | |||
"description": "To be used with NYU datamart.", | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"created": "2019-06-06T21:30:30Z", | |||
"context": "TESTING", | |||
"inputs": [ | |||
{ | |||
"name": "input dataset" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.12.produce", | |||
"name": "predictions" | |||
} | |||
], | |||
"steps": [ | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "fe0f1ac8-1d39-463a-b344-7bd498a31b91", | |||
"version": "0.1", | |||
"name": "Perform dataset augmentation using Datamart", | |||
"python_path": "d3m.primitives.data_augmentation.datamart_augmentation.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "inputs.0" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"system_identifier": { | |||
"type": "VALUE", | |||
"data": "NYU" | |||
}, | |||
"search_result": { | |||
"type": "VALUE", | |||
"data": "{\"augmentation\": {\"left_columns\": [[1]], \"left_columns_names\": [\"tpep_pickup_datetime\"], \"right_columns\": [[0]], \"type\": \"join\"}, \"id\": \"datamart.url.a3943fd7892d5d219012f889327c6661\", \"metadata\": {\"columns\": [{\"coverage\": [{\"range\": {\"gte\": 1451610000.0, \"lte\": 1540252800.0}}], \"mean\": 1495931400.0, \"name\": \"DATE\", \"semantic_types\": [\"http://schema.org/DateTime\"], \"stddev\": 25590011.431395352, \"structural_type\": \"http://schema.org/Text\"}, {\"name\": \"HOURLYSKYCONDITIONS\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": -17.2, \"lte\": 37.8}}], \"mean\": 14.666224009096823, \"name\": \"HOURLYDRYBULBTEMPC\", \"semantic_types\": [], \"stddev\": 9.973788193915643, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 11.0, \"lte\": 100.0}}], \"mean\": 60.70849577647823, \"name\": \"HOURLYRelativeHumidity\", \"semantic_types\": [], \"stddev\": 18.42048051096981, \"structural_type\": \"http://schema.org/Float\"}, {\"coverage\": [{\"range\": {\"gte\": 0.0, \"lte\": 41.0}}], \"mean\": 10.68859649122807, \"name\": \"HOURLYWindSpeed\", \"semantic_types\": [], \"stddev\": 5.539675475162907, \"structural_type\": \"http://schema.org/Float\"}, {\"name\": \"HOURLYWindDirection\", \"semantic_types\": [], \"structural_type\": \"http://schema.org/Text\"}, {\"coverage\": [{\"range\": {\"gte\": 28.89, \"lte\": 30.81}}], \"mean\": 29.90760315139694, \"name\": \"HOURLYStationPressure\", \"semantic_types\": [\"https://metadata.datadrivendiscovery.org/types/PhoneNumber\"], \"stddev\": 0.24584097919742368, \"structural_type\": \"http://schema.org/Float\"}], \"date\": \"2019-01-22T01:54:58.281183Z\", \"description\": \"This data contains weather information for NY city around LaGuardia Airport from 2016 to 2018; weath...\", \"materialize\": {\"direct_url\": \"https://drive.google.com/uc?export=download&id=1jRwzZwEGMICE3n6-nwmVxMD2c0QCHad4\", \"identifier\": \"datamart.url\"}, \"name\": \"Newyork Weather Data around Airport 2016-18\", \"nb_rows\": 24624, \"size\": 1523693}, \"score\": 1.0, \"supplied_id\": \"DA_ny_taxi_demand_dataset_TRAIN\", \"supplied_resource_id\": \"learningData\"}" | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e", | |||
"version": "0.2.0", | |||
"name": "Denormalize datasets", | |||
"python_path": "d3m.primitives.data_transformation.denormalize.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.0.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"version": "0.3.0", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.1.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"version": "0.6.0", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.2.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"version": "0.3.0", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.3.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"semantic_types": { | |||
"type": "VALUE", | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/Attribute" | |||
] | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.11.13", | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.4.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"strategy": { | |||
"type": "VALUE", | |||
"data": "most_frequent" | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"name": "sklearn.preprocessing.data.OneHotEncoder", | |||
"python_path": "d3m.primitives.data_transformation.one_hot_encoder.SKlearn", | |||
"version": "2019.11.13", | |||
"id": "c977e879-1bf5-3829-b5b0-39b00233aff5" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.5.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"handle_unknown": { | |||
"type": "VALUE", | |||
"data": "ignore" | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "eb5fe752-f22a-4090-948b-aafcef203bf5", | |||
"version": "0.2.0", | |||
"name": "Casts DataFrame", | |||
"python_path": "d3m.primitives.data_transformation.cast_to_type.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.6.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"type_to_cast": { | |||
"type": "VALUE", | |||
"data": "float" | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"version": "0.3.0", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.3.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"semantic_types": { | |||
"type": "VALUE", | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
] | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "eb5fe752-f22a-4090-948b-aafcef203bf5", | |||
"version": "0.2.0", | |||
"name": "Casts DataFrame", | |||
"python_path": "d3m.primitives.data_transformation.cast_to_type.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.8.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"name": "sklearn.ensemble.forest.RandomForestRegressor", | |||
"python_path": "d3m.primitives.regression.random_forest.SKlearn", | |||
"version": "2019.11.13", | |||
"id": "f0fd7a62-09b5-3abc-93bb-f5f999f7cc80" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.7.produce" | |||
}, | |||
"outputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.9.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"version": "0.3.0", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.3.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"hyperparams": { | |||
"semantic_types": { | |||
"type": "VALUE", | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/Target", | |||
"https://metadata.datadrivendiscovery.org/types/PrimaryKey" | |||
] | |||
} | |||
} | |||
}, | |||
{ | |||
"type": "PRIMITIVE", | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"version": "0.3.0", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common" | |||
}, | |||
"arguments": { | |||
"inputs": { | |||
"type": "CONTAINER", | |||
"data": "steps.10.produce" | |||
}, | |||
"reference": { | |||
"type": "CONTAINER", | |||
"data": "steps.11.produce" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
] | |||
} | |||
] | |||
} |
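The `search_result` hyper-parameter of step 0 above is itself a JSON document serialized into a string, so inspecting it takes a second `json.loads`. A sketch, with a hypothetical filename following the id-based naming used elsewhere in the repository:

import json

with open('4ff2f21d-1bba-4c44-bb96-e05728bcf6ed.json') as pipeline_file:  # hypothetical filename
    document = json.load(pipeline_file)

search_result = json.loads(document['steps'][0]['hyperparams']['search_result']['data'])
print(search_result['augmentation']['type'])  # join
print(search_result['metadata']['nb_rows'])   # 24624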
@@ -1,123 +0,0 @@ | |||
id: 387d432a-9893-4558-b190-1c5e9e399dbf | |||
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json | |||
source: | |||
  name: Jeffrey Gleason | |||
created: "2019-06-05T02:48:52.806069Z" | |||
context: TESTING | |||
name: Dataset sample test pipeline | |||
description: | | |||
  A simple pipeline which runs a Random Forest classifier on tabular data after sampling the dataset (50% of rows). | |||
inputs: | |||
  - name: input dataset | |||
outputs: | |||
  - name: predictions | |||
    data: steps.6.produce | |||
steps: | |||
  # Step 0. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 268315c1-7549-4aee-a4cc-28921cba74c0 | |||
      version: 0.1.0 | |||
      python_path: d3m.primitives.data_preprocessing.dataset_sample.Common | |||
      name: Dataset sampling primitive | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: inputs.0 | |||
    outputs: | |||
      - id: produce | |||
  # Step 1. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e | |||
      version: 0.2.0 | |||
      python_path: d3m.primitives.data_transformation.denormalize.Common | |||
      name: Denormalize datasets | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.0.produce | |||
    outputs: | |||
      - id: produce | |||
  # Step 2. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 4b42ce1e-9b98-4a25-b68e-fad13311eb65 | |||
      version: 0.3.0 | |||
      python_path: d3m.primitives.data_transformation.dataset_to_dataframe.Common | |||
      name: Extract a DataFrame from a Dataset | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.1.produce | |||
    outputs: | |||
      - id: produce | |||
  # Step 3. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: d510cb7a-1782-4f51-b44c-58f0236e47c7 | |||
      version: 0.6.0 | |||
      python_path: d3m.primitives.data_transformation.column_parser.Common | |||
      name: Parses strings into their types | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.2.produce | |||
    outputs: | |||
      - id: produce | |||
  # Step 4. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: d016df89-de62-3c53-87ed-c06bb6a23cde | |||
      version: 2019.6.7 | |||
      python_path: d3m.primitives.data_cleaning.imputer.SKlearn | |||
      name: sklearn.impute.SimpleImputer | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.3.produce | |||
    outputs: | |||
      - id: produce | |||
    hyperparams: | |||
      use_semantic_types: | |||
        type: VALUE | |||
        data: true | |||
      return_result: | |||
        type: VALUE | |||
        data: replace | |||
  # Step 5. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 37c2b19d-bdab-4a30-ba08-6be49edcc6af | |||
      version: 0.4.0 | |||
      python_path: d3m.primitives.classification.random_forest.Common | |||
      name: Random forest classifier | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.4.produce | |||
      outputs: | |||
        type: CONTAINER | |||
        data: steps.4.produce | |||
    outputs: | |||
      - id: produce | |||
    hyperparams: | |||
      return_result: | |||
        type: VALUE | |||
        data: replace | |||
  # Step 6. | |||
  - type: PRIMITIVE | |||
    primitive: | |||
      id: 8d38b340-f83f-4877-baaa-162f8e551736 | |||
      version: 0.3.0 | |||
      python_path: d3m.primitives.data_transformation.construct_predictions.Common | |||
      name: Construct pipeline predictions output | |||
    arguments: | |||
      inputs: | |||
        type: CONTAINER | |||
        data: steps.5.produce | |||
      reference: | |||
        type: CONTAINER | |||
        data: steps.3.produce | |||
    outputs: | |||
      - id: produce |
@@ -1,300 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T02:10:00.929519Z", | |||
"id": "2b307634-f01e-412e-8d95-7e54afd4731f", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.9.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.3.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.2.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "eaec420d-46eb-4ddf-a2cd-b8097345ff3e", | |||
"name": "One-hot maker", | |||
"python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.MakerCommon", | |||
"version": "0.2.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"left": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"right": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "aff6a77a-faa0-41c5-9595-de2e7f7c4760", | |||
"name": "Concatenate two dataframes", | |||
"python_path": "d3m.primitives.data_transformation.horizontal_concat.DataFrameCommon", | |||
"version": "0.2.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.7.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "1dd82833-5692-39cb-84fb-2455683075f3", | |||
"name": "sklearn.ensemble.forest.RandomForestClassifier", | |||
"python_path": "d3m.primitives.classification.random_forest.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.8.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json |
@@ -1 +0,0 @@ | |||
{"id": "4ec215d1-6484-4502-a6dd-f659943ccb94", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T17:49:59.327063Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.7.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "81d7e261-e25b-4721-b091-a31cd46e99ae", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns.Common", "name": "Extracts columns", "digest": "7b9ba98e3b7b9d1d8e17547249c7a25cd8d58ec60d957217f772753e37526145"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [6]}}}, {"type": "PRIMITIVE", "primitive": {"id": "09f252eb-215d-4e0b-9a60-fcd967f5e708", "version": "0.2.0", "python_path": "d3m.primitives.data_transformation.encoder.DistilTextEncoder", "name": "Text encoder", "digest": "e468d66d1eda057a61b2c79ecf5288f137778f47dac9eabdc60707a4941532a3"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"encoder_type": {"type": "VALUE", "data": "tfidf"}}}, {"type": "PRIMITIVE", "primitive": {"id": "e0ad06ce-b484-46b0-a478-c567e1ea7e02", "version": "0.2.0", "python_path": "d3m.primitives.learner.random_forest.DistilEnsembleForest", "name": "EnsembleForest", "digest": "4ba7a354b15ea626bf96aa771a2a3cba034ad5d0a8ccdbbf68bce2d828db1b4d"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": 
"d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "a26edc0cc9bcf9121189186d621ff1b4cebb2afc76b6ef171d7d8194e55cf475"} |
@@ -1,71 +0,0 @@ | |||
from d3m import index | |||
from d3m.metadata.base import ArgumentType, Context | |||
from d3m.metadata.pipeline import Pipeline, PrimitiveStep | |||
# Creating pipeline | |||
pipeline_description = Pipeline() | |||
pipeline_description.add_input(name='inputs') | |||
# Step 0: dataset_to_dataframe | |||
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) | |||
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') | |||
step_0.add_output('produce') | |||
pipeline_description.add_step(step_0) | |||
# Step 1: Simple profiler primitive | |||
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common')) | |||
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') | |||
step_1.add_output('produce') | |||
pipeline_description.add_step(step_1) | |||
# Step 2: column_parser | |||
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) | |||
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') | |||
step_2.add_output('produce') | |||
pipeline_description.add_step(step_2) | |||
# Step 3: Extract text column explicitly | |||
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common')) | |||
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') | |||
step_3.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data=[25]) | |||
step_3.add_output('produce') | |||
pipeline_description.add_step(step_3) | |||
# Step 4: Extract target column explicitly | |||
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns.Common')) | |||
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') | |||
step_4.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data=[6]) | |||
step_4.add_output('produce') | |||
pipeline_description.add_step(step_4) | |||
# Step 5: encode text column | |||
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.encoder.DistilTextEncoder')) | |||
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') | |||
step_5.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') | |||
step_5.add_hyperparameter(name='encoder_type', argument_type=ArgumentType.VALUE, data='tfidf') | |||
step_5.add_output('produce') | |||
pipeline_description.add_step(step_5) | |||
# Step 6: classifier | |||
step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.learner.random_forest.DistilEnsembleForest')) | |||
step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') | |||
step_6.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') | |||
step_6.add_output('produce') | |||
pipeline_description.add_step(step_6) | |||
# Step 7: construct output | |||
step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) | |||
step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce') | |||
step_7.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') | |||
step_7.add_output('produce') | |||
pipeline_description.add_step(step_7) | |||
# Final Output | |||
pipeline_description.add_output(name='output predictions', data_reference='steps.7.produce') | |||
# Output json pipeline | |||
blob = pipeline_description.to_json() | |||
filename = blob[8:44] + '.json'  # slice the 36-character pipeline id out of the serialized JSON | |||
with open(filename, 'w') as outfile: | |||
    outfile.write(blob) | |||
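The `blob[8:44]` slice relies on `to_json` serializing the pipeline as `{"id": "<36-character UUID>", ...}` with the id first, as the flattened pipeline files above show. A more robust variant, sketched under the assumption that the blob is ordinary JSON, parses it instead of slicing:

import json

blob = pipeline_description.to_json()
filename = json.loads(blob)['id'] + '.json'  # does not depend on key order or spacing
with open(filename, 'w') as outfile:
    outfile.write(blob)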
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1 +0,0 @@ | |||
../classification.light_gbm.DataFrameCommon/d2473bbc-7839-4deb-9ba4-4ff4bc9b0bde.json |
@@ -1 +0,0 @@ | |||
{"id": "b523335c-0c47-4d02-a582-f69609cde1e8", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:51:17.782254Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.9.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "79674d68-9b93-4359-b385-7b5f60645b06", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_structural_types.Common", "name": "Extracts columns by structural type", "digest": "7805010b9581bb96c035fefa5943209c69a1e234f10d9057d487af42c0fd4830"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "f6315ca9-ca39-4e13-91ba-1964ee27281c", "version": "0.1.0", "python_path": "d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon", "name": "Pandas one hot encoder", "digest": "ed1217d4d7c017d8239b4f958c8e6ca0b3b67966ccb50cc5c578a9f14e465ec0"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"use_columns": {"type": "VALUE", "data": [2, 5]}}}, {"type": "PRIMITIVE", "primitive": {"id": "3b09ba74-cc90-4f22-9e0a-0cf4f29a7e28", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.remove_columns.Common", "name": "Removes columns", "digest": "a725d149595186b85f1dea2bacbf4b853712b6a50eddb7c4c2295fabc3a04df1"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.4.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"columns": {"type": "VALUE", "data": [25]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Attribute"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", "version": "0.3.0", "python_path": 
"d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", "name": "Extracts columns by semantic type", "digest": "505df38f9be4964ff19683ab3e185f19333fb35c26121c12a1c55bddd9d38f72"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.5.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"semantic_types": {"type": "VALUE", "data": ["https://metadata.datadrivendiscovery.org/types/Target"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "37c2b19d-bdab-4a30-ba08-6be49edcc6af", "version": "0.4.0", "python_path": "d3m.primitives.classification.random_forest.Common", "name": "Random forest classifier", "digest": "f5f702fc561775a6064c64c008a519f605eb00ca80f59a5d5e39b1340c7c015e"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.6.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.7.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "8d38b340-f83f-4877-baaa-162f8e551736", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.construct_predictions.Common", "name": "Construct pipeline predictions output", "digest": "674a644333a3a481769591341591461b06de566fef7439010284739194e18af8"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.8.produce"}, "reference": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "7929f79fa8e2aaddcbe66d0f592525081280549e0713198e583728ff88b0f895"} |
@@ -1,83 +0,0 @@ | |||
from d3m import index | |||
from d3m.metadata.base import ArgumentType, Context | |||
from d3m.metadata.pipeline import Pipeline, PrimitiveStep | |||
# Creating pipeline | |||
pipeline_description = Pipeline() | |||
pipeline_description.add_input(name='inputs') | |||
# Step 0: dataset_to_dataframe | |||
step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.dataset_to_dataframe.Common')) | |||
step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') | |||
step_0.add_output('produce') | |||
pipeline_description.add_step(step_0) | |||
# Step 1: Simple profiler primitive | |||
step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.schema_discovery.profiler.Common')) | |||
step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') | |||
step_1.add_output('produce') | |||
pipeline_description.add_step(step_1) | |||
# Step 2: Extract columns by structural type | |||
step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_structural_types.Common')) | |||
step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') | |||
step_2.add_output('produce') | |||
pipeline_description.add_step(step_2) | |||
# Step 3: column_parser | |||
step_3 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.column_parser.Common')) | |||
step_3.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.2.produce') | |||
step_3.add_output('produce') | |||
pipeline_description.add_step(step_3) | |||
# Step 4: one hot encode | |||
step_4 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_preprocessing.one_hot_encoder.PandasCommon')) | |||
step_4.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.3.produce') | |||
step_4.add_hyperparameter(name='use_columns', argument_type=ArgumentType.VALUE, data=[2, 5]) | |||
step_4.add_output('produce') | |||
pipeline_description.add_step(step_4) | |||
# Step 5: remove text | |||
step_5 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.remove_columns.Common')) | |||
step_5.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.4.produce') | |||
step_5.add_hyperparameter(name='columns', argument_type=ArgumentType.VALUE, data=[25]) | |||
step_5.add_output('produce') | |||
pipeline_description.add_step(step_5) | |||
# Step 6: extract attributes | |||
step_6 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) | |||
step_6.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') | |||
step_6.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Attribute']) | |||
step_6.add_output('produce') | |||
pipeline_description.add_step(step_6) | |||
# Step 7: extract target | |||
step_7 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common')) | |||
step_7.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.5.produce') | |||
step_7.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, data=['https://metadata.datadrivendiscovery.org/types/Target']) | |||
step_7.add_output('produce') | |||
pipeline_description.add_step(step_7) | |||
# Step 8: classifier | |||
step_8 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.classification.random_forest.Common')) | |||
step_8.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.6.produce') | |||
step_8.add_argument(name='outputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.7.produce') | |||
step_8.add_output('produce') | |||
pipeline_description.add_step(step_8) | |||
# Step 9: construct output | |||
step_9 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.data_transformation.construct_predictions.Common')) | |||
step_9.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.8.produce') | |||
step_9.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') | |||
step_9.add_output('produce') | |||
pipeline_description.add_step(step_9) | |||
# Final Output | |||
pipeline_description.add_output(name='output predictions', data_reference='steps.9.produce') | |||
# Serialize the pipeline to JSON | |||
blob = pipeline_description.to_json() | |||
filename = blob[8:44] + '.json'  # blob[8:44] slices the pipeline's UUID "id" field out of the JSON | |||
with open(filename, 'w') as outfile: | |||
outfile.write(blob) | |||
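# --- Editor's sketch (not part of the original file): one way the saved | |||
# pipeline could be executed with the d3m reference runtime. The dataset | |||
# path below is a hypothetical placeholder. | |||
# | |||
# from d3m.container.dataset import Dataset | |||
# from d3m.runtime import Runtime | |||
# | |||
# with open(filename) as pipeline_file: | |||
#     pipeline = Pipeline.from_json(pipeline_file) | |||
# dataset = Dataset.load('file:///path/to/datasetDoc.json') | |||
# runtime = Runtime(pipeline=pipeline, context=Context.TESTING) | |||
# runtime.fit(inputs=[dataset]) | |||
# predictions = runtime.produce(inputs=[dataset]).values['outputs.0'] | |||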
@@ -1 +0,0 @@ | |||
{"id": "a8c40699-c48d-4f12-aa18-639c5fb6baae", "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", "created": "2020-01-15T19:35:58.976691Z", "inputs": [{"name": "inputs"}], "outputs": [{"data": "steps.4.produce", "name": "output predictions"}], "steps": [{"type": "PRIMITIVE", "primitive": {"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", "version": "0.3.0", "python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", "name": "Extract a DataFrame from a Dataset", "digest": "a1a0109be87a6ae578fd20e9d46c70c806059076c041b80b6314e7e41cf62d82"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "inputs.0"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "e193afa1-b45e-4d29-918f-5bb1fa3b88a7", "version": "0.2.0", "python_path": "d3m.primitives.schema_discovery.profiler.Common", "name": "Determine missing semantic types for columns automatically", "digest": "a3d51cbc0bf18168114c1c8f12c497d691dbe30b71667f355f30c13a9a08ba32"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.0.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", "version": "0.6.0", "python_path": "d3m.primitives.data_transformation.column_parser.Common", "name": "Parses strings into their types", "digest": "b020e14e3d4f1e4266aa8a0680d83afcf2862300549c6f6c903742d7d171f879"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.1.produce"}}, "outputs": [{"id": "produce"}], "hyperparams": {"parse_semantic_types": {"type": "VALUE", "data": ["http://schema.org/Boolean", "http://schema.org/Integer", "http://schema.org/Float", "https://metadata.datadrivendiscovery.org/types/FloatVector", "http://schema.org/DateTime"]}}}, {"type": "PRIMITIVE", "primitive": {"id": "59db88b9-dd81-4e50-8f43-8f2af959560b", "version": "0.1.0", "python_path": "d3m.primitives.data_transformation.grouping_field_compose.Common", "name": "Grouping Field Compose", "digest": "e93815bfdb1c82ce0e2fa61f092d6ee9bcf39367a27072accbb9f0dd9189fb03"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.2.produce"}}, "outputs": [{"id": "produce"}]}, {"type": "PRIMITIVE", "primitive": {"id": "76b5a479-c209-4d94-92b5-7eba7a4d4499", "version": "1.0.2", "python_path": "d3m.primitives.time_series_forecasting.vector_autoregression.VAR", "name": "VAR", "digest": "7e22a1e7fe228114a5788f16a8d3c7709ed3a98a90e9cc82e3b80ab5f232d352"}, "arguments": {"inputs": {"type": "CONTAINER", "data": "steps.3.produce"}, "outputs": {"type": "CONTAINER", "data": "steps.3.produce"}}, "outputs": [{"id": "produce"}]}], "digest": "da2c7d2605256f263ca4725fe7385be5e027a3ddadc8dbf7523ff98bcd016005"} |
@@ -1,100 +0,0 @@ | |||
from d3m import index | |||
from d3m.metadata.base import ArgumentType | |||
from d3m.metadata.pipeline import Pipeline, PrimitiveStep | |||
# Creating pipeline | |||
pipeline_description = Pipeline() | |||
pipeline_description.add_input(name="inputs") | |||
# Step 0: DS to DF on input DS | |||
step_0 = PrimitiveStep( | |||
primitive=index.get_primitive( | |||
"d3m.primitives.data_transformation.dataset_to_dataframe.Common" | |||
) | |||
) | |||
step_0.add_argument( | |||
name="inputs", argument_type=ArgumentType.CONTAINER, data_reference="inputs.0" | |||
) | |||
step_0.add_output("produce") | |||
pipeline_description.add_step(step_0) | |||
# Step 1: Simple Profiler Column Role Annotation | |||
step_1 = PrimitiveStep( | |||
primitive=index.get_primitive("d3m.primitives.schema_discovery.profiler.Common") | |||
) | |||
step_1.add_argument( | |||
name="inputs", | |||
argument_type=ArgumentType.CONTAINER, | |||
data_reference="steps.0.produce", | |||
) | |||
step_1.add_output("produce") | |||
pipeline_description.add_step(step_1) | |||
# Step 2: column parser on input DF | |||
step_2 = PrimitiveStep( | |||
primitive=index.get_primitive( | |||
"d3m.primitives.data_transformation.column_parser.Common" | |||
) | |||
) | |||
step_2.add_argument( | |||
name="inputs", | |||
argument_type=ArgumentType.CONTAINER, | |||
data_reference="steps.1.produce", | |||
) | |||
step_2.add_output("produce") | |||
step_2.add_hyperparameter( | |||
name="parse_semantic_types", | |||
argument_type=ArgumentType.VALUE, | |||
data=[ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime", | |||
], | |||
) | |||
pipeline_description.add_step(step_2) | |||
# Step 3: Grouping Field Compose | |||
step_3 = PrimitiveStep( | |||
primitive=index.get_primitive( | |||
"d3m.primitives.data_transformation.grouping_field_compose.Common" | |||
) | |||
) | |||
step_3.add_argument( | |||
name="inputs", | |||
argument_type=ArgumentType.CONTAINER, | |||
data_reference="steps.2.produce", | |||
) | |||
step_3.add_output("produce") | |||
pipeline_description.add_step(step_3) | |||
# Step 4: forecasting primitive | |||
step_4 = PrimitiveStep( | |||
primitive=index.get_primitive( | |||
"d3m.primitives.time_series_forecasting.vector_autoregression.VAR" | |||
) | |||
) | |||
step_4.add_argument( | |||
name="inputs", | |||
argument_type=ArgumentType.CONTAINER, | |||
data_reference="steps.3.produce", | |||
) | |||
step_4.add_argument( | |||
name="outputs", | |||
argument_type=ArgumentType.CONTAINER, | |||
data_reference="steps.3.produce", | |||
) | |||
step_4.add_output("produce") | |||
pipeline_description.add_step(step_4) | |||
# Final Output | |||
pipeline_description.add_output( | |||
name="output predictions", data_reference="steps.4.produce" | |||
) | |||
# Serialize the pipeline to JSON | |||
blob = pipeline_description.to_json() | |||
filename = blob[8:44] + ".json"  # blob[8:44] slices the pipeline's UUID "id" field out of the JSON | |||
with open(filename, "w") as outfile: | |||
outfile.write(blob) |
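# --- Editor's note (illustrative, not from the original file): the slice | |||
# blob[8:44] relies on "id" being the first key of the serialized JSON. A | |||
# more robust equivalent would parse the document instead: | |||
# | |||
#     import json | |||
#     filename = json.loads(blob)["id"] + ".json" | |||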
@@ -1 +0,0 @@ | |||
../data_preprocessing.one_hot_encoder.MakerCommon/2b307634-f01e-412e-8d95-7e54afd4731f.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1,272 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T02:01:52.663008Z", | |||
"id": "11ee9290-992d-4e48-97ed-1a6e4c15f92f", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.8.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"separator": { | |||
"data": "----", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "7b067a78-4ad4-411d-9cf9-87bcee38ac73", | |||
"name": "Rename all the duplicated name column in DataFrame", | |||
"python_path": "d3m.primitives.data_transformation.rename_duplicate_name.DataFrameCommon", | |||
"version": "0.2.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.2.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.2.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "1dd82833-5692-39cb-84fb-2455683075f3", | |||
"name": "sklearn.ensemble.forest.RandomForestClassifier", | |||
"python_path": "d3m.primitives.classification.random_forest.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.7.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
@@ -1,83 +0,0 @@ | |||
id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51 | |||
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json | |||
source: | |||
name: Jeffrey Gleason | |||
created: "2019-04-08T16:18:27.250294Z" | |||
context: TESTING | |||
name: K-fold split of timeseries datasets | |||
description: | | |||
K-fold split of timeseries datasets for cross-validation. | |||
inputs: | |||
- name: folds | |||
- name: full dataset | |||
outputs: | |||
- name: train datasets | |||
data: steps.0.produce | |||
- name: test datasets | |||
data: steps.2.produce | |||
- name: score datasets | |||
data: steps.1.produce | |||
steps: | |||
# Step 0. | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b | |||
version: 0.1.0 | |||
python_path: d3m.primitives.evaluation.kfold_time_series_split.Common | |||
name: K-fold cross-validation timeseries dataset splits | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: inputs.0 | |||
dataset: | |||
type: CONTAINER | |||
data: inputs.1 | |||
outputs: | |||
- id: produce | |||
- id: produce_score_data | |||
# Step 1. We redact privileged attributes for both score and test splits. | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 | |||
version: 0.2.0 | |||
python_path: d3m.primitives.evaluation.redact_columns.Common | |||
name: Redact columns for evaluation | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: steps.0.produce_score_data | |||
outputs: | |||
- id: produce | |||
hyperparams: | |||
semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/PrivilegedData | |||
add_semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData | |||
- https://metadata.datadrivendiscovery.org/types/MissingData | |||
# Step 2. We further redact targets in test split. | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 | |||
version: 0.2.0 | |||
python_path: d3m.primitives.evaluation.redact_columns.Common | |||
name: Redact columns for evaluation | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: steps.1.produce | |||
outputs: | |||
- id: produce | |||
hyperparams: | |||
semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/TrueTarget | |||
add_semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/RedactedTarget | |||
- https://metadata.datadrivendiscovery.org/types/MissingData |
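# Editor's note (not part of the original pipeline): this is a data-preparation | |||
# pipeline rather than a modelling one. Its two inputs ("folds", "full dataset") | |||
# are supplied by the evaluation harness, and the three outputs above provide | |||
# the train, test, and score splits for each fold. | |||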
@@ -1,108 +0,0 @@ | |||
# TODO: change name | |||
id: 5bed1f23-ac17-4b52-9d06-a5b77a6aea51 | |||
schema: https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json | |||
source: | |||
name: Jeffrey Gleason | |||
created: "2019-12-19T16:29:34.702933Z" | |||
context: TESTING | |||
name: K-fold split of timeseries datasets | |||
description: | | |||
K-fold split of timeseries datasets for cross-validation. | |||
inputs: | |||
- name: folds | |||
- name: full dataset | |||
outputs: | |||
- name: train datasets | |||
data: steps.2.produce | |||
- name: test datasets | |||
data: steps.4.produce | |||
- name: score datasets | |||
data: steps.3.produce | |||
steps: | |||
# Step 0. Simon Data Typing primitive to infer DateTime column | |||
- type: PRIMITIVE | |||
primitive: | |||
id: d2fa8df2-6517-3c26-bafc-87b701c4043a | |||
version: 1.2.2 | |||
python_path: d3m.primitives.data_cleaning.column_type_profiler.Simon | |||
name: simon | |||
# Step 1. Mapped Simon Data Typing primitive to infer DateTime column | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 5bef5738-1638-48d6-9935-72445f0eecdc | |||
version: 0.1.0 | |||
python_path: d3m.primitives.operator.dataset_map.DataFrameCommon | |||
name: Map DataFrame resources to new resources using provided primitive | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: inputs.1 | |||
outputs: | |||
- id: produce | |||
hyperparams: | |||
primitive: | |||
type: PRIMITIVE | |||
data: 0 | |||
# Step 2. K-fold cross-validation timeseries dataset splits | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 002f9ad1-46e3-40f4-89ed-eeffbb3a102b | |||
version: 0.1.0 | |||
python_path: d3m.primitives.evaluation.kfold_time_series_split.Common | |||
name: K-fold cross-validation timeseries dataset splits | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: inputs.0 | |||
dataset: | |||
type: CONTAINER | |||
data: steps.1.produce | |||
outputs: | |||
- id: produce | |||
- id: produce_score_data | |||
# Step 3. We redact privileged attributes for both score and test splits. | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 | |||
version: 0.2.0 | |||
python_path: d3m.primitives.evaluation.redact_columns.Common | |||
name: Redact columns for evaluation | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: steps.2.produce_score_data | |||
outputs: | |||
- id: produce | |||
hyperparams: | |||
semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/PrivilegedData | |||
add_semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/RedactedPrivilegedData | |||
- https://metadata.datadrivendiscovery.org/types/MissingData | |||
# Step 4. We further redact targets in test split. | |||
- type: PRIMITIVE | |||
primitive: | |||
id: 744c4090-e2f6-489e-8efc-8b1e051bfad6 | |||
version: 0.2.0 | |||
python_path: d3m.primitives.evaluation.redact_columns.Common | |||
name: Redact columns for evaluation | |||
arguments: | |||
inputs: | |||
type: CONTAINER | |||
data: steps.3.produce | |||
outputs: | |||
- id: produce | |||
hyperparams: | |||
semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/TrueTarget | |||
add_semantic_types: | |||
type: VALUE | |||
data: | |||
- https://metadata.datadrivendiscovery.org/types/RedactedTarget | |||
- https://metadata.datadrivendiscovery.org/types/MissingData |
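# Editor's note (not part of the original pipeline): this variant differs from | |||
# the plain k-fold split above by first running the Simon column-type profiler | |||
# (steps 0-1, applied over the dataset via the dataset_map primitive) so that a | |||
# DateTime column is inferred before the timeseries split in step 2. | |||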
@@ -1,247 +0,0 @@ | |||
{ | |||
"context": "TESTING", | |||
"created": "2019-02-12T01:35:59.402796Z", | |||
"id": "0f636602-6299-411b-9873-4b974cd393ba", | |||
"inputs": [ | |||
{ | |||
"name": "inputs" | |||
} | |||
], | |||
"outputs": [ | |||
{ | |||
"data": "steps.7.produce", | |||
"name": "output predictions" | |||
} | |||
], | |||
"schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json", | |||
"steps": [ | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "inputs.0", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4b42ce1e-9b98-4a25-b68e-fad13311eb65", | |||
"name": "Extract a DataFrame from a Dataset", | |||
"python_path": "d3m.primitives.data_transformation.dataset_to_dataframe.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"parse_semantic_types": { | |||
"data": [ | |||
"http://schema.org/Boolean", | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float", | |||
"https://metadata.datadrivendiscovery.org/types/FloatVector", | |||
"http://schema.org/DateTime" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d510cb7a-1782-4f51-b44c-58f0236e47c7", | |||
"name": "Parses strings into their types", | |||
"python_path": "d3m.primitives.data_transformation.column_parser.Common", | |||
"version": "0.6.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/CategoricalData" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"exclude_columns": { | |||
"data": [ | |||
0 | |||
], | |||
"type": "VALUE" | |||
}, | |||
"semantic_types": { | |||
"data": [ | |||
"http://schema.org/Integer", | |||
"http://schema.org/Float" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.0.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"semantic_types": { | |||
"data": [ | |||
"https://metadata.datadrivendiscovery.org/types/TrueTarget" | |||
], | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "4503a4c6-42f7-45a1-a1d4-ed69699cf5e1", | |||
"name": "Extracts columns by semantic type", | |||
"python_path": "d3m.primitives.data_transformation.extract_columns_by_semantic_types.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.3.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
}, | |||
"use_semantic_types": { | |||
"data": true, | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "d016df89-de62-3c53-87ed-c06bb6a23cde", | |||
"name": "sklearn.impute.SimpleImputer", | |||
"python_path": "d3m.primitives.data_cleaning.imputer.SKlearn", | |||
"version": "2019.6.7" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.5.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"outputs": { | |||
"data": "steps.4.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"hyperparams": { | |||
"return_result": { | |||
"data": "replace", | |||
"type": "VALUE" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "cdbb80e4-e9de-4caa-a710-16b5d727b959", | |||
"name": "XGBoost GBTree regressor", | |||
"python_path": "d3m.primitives.regression.xgboost_gbtree.Common", | |||
"version": "0.1.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
}, | |||
{ | |||
"arguments": { | |||
"inputs": { | |||
"data": "steps.6.produce", | |||
"type": "CONTAINER" | |||
}, | |||
"reference": { | |||
"data": "steps.1.produce", | |||
"type": "CONTAINER" | |||
} | |||
}, | |||
"outputs": [ | |||
{ | |||
"id": "produce" | |||
} | |||
], | |||
"primitive": { | |||
"id": "8d38b340-f83f-4877-baaa-162f8e551736", | |||
"name": "Construct pipeline predictions output", | |||
"python_path": "d3m.primitives.data_transformation.construct_predictions.Common", | |||
"version": "0.3.0" | |||
}, | |||
"type": "PRIMITIVE" | |||
} | |||
] | |||
} |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns.Common/4ec215d1-6484-4502-a6dd-f659943ccb94.json |
@@ -1 +0,0 @@ | |||
../data_transformation.grouping_field_compose.Common/a8c40699-c48d-4f12-aa18-639c5fb6baae.json |
@@ -1 +0,0 @@ | |||
../data_transformation.extract_columns_by_structural_types.Common/b523335c-0c47-4d02-a582-f69609cde1e8.json |
@@ -1,44 +0,0 @@ | |||
#!/bin/bash | |||
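# Runs every pipeline run file found under pipeline_runs/ (*.yml.gz, *.yaml.gz) | |||
# through `python3 -m d3m runtime fit-score`, writing predictions, scores, and | |||
# the resulting pipeline run under results/. Symlinked run files are skipped. | |||
# Exits non-zero if any run fails or its pipelines cannot be found. | |||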
mkdir -p results | |||
overall_result="0" | |||
while IFS= read -r pipeline_run_file; do | |||
pipeline_run_name="$(dirname "$pipeline_run_file")/$(basename -s .yml.gz "$(basename -s .yaml.gz "$pipeline_run_file")")" | |||
primitive_name="$(basename "$(dirname "$pipeline_run_file")")" | |||
if [[ -L "$pipeline_run_file" ]]; then | |||
echo ">>> Skipping '$pipeline_run_file'." | |||
continue | |||
else | |||
mkdir -p "results/$pipeline_run_name" | |||
fi | |||
pipelines_path="pipelines/$primitive_name" | |||
if [[ ! -d "$pipelines_path" ]]; then | |||
echo ">>> ERROR: Could not find pipelines for '$pipeline_run_file'." | |||
overall_result="1" | |||
continue | |||
fi | |||
echo ">>> Running '$pipeline_run_file'." | |||
python3 -m d3m --pipelines-path "$pipelines_path" \ | |||
runtime \ | |||
--datasets /data/datasets --volumes /data/static_files \ | |||
fit-score --input-run "$pipeline_run_file" \ | |||
--output "results/$pipeline_run_name/predictions.csv" \ | |||
--scores "results/$pipeline_run_name/scores.csv" \ | |||
--output-run "results/$pipeline_run_name/pipeline_runs.yaml" | |||
result="$?" | |||
if [[ "$result" -eq 0 ]]; then | |||
echo ">>> SUCCESS ($pipeline_run_file)" | |||
else | |||
echo ">>> ERROR ($pipeline_run_file)" | |||
overall_result="1" | |||
fi | |||
done < <(find pipeline_runs -name '*.yml.gz' -or -name '*.yaml.gz') | |||
exit "$overall_result" |
@@ -1,11 +0,0 @@ | |||
#!/usr/bin/env python3 | |||
import sys | |||
import unittest | |||
runner = unittest.TextTestRunner(verbosity=1) | |||
tests = unittest.TestLoader().discover('tests') | |||
if not runner.run(tests).wasSuccessful(): | |||
sys.exit(1) |
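# Editor's note (illustrative): roughly equivalent to | |||
#     python3 -m unittest discover tests | |||
# but with the process exit code made explicit. | |||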
@@ -1,28 +0,0 @@ | |||
[pycodestyle] | |||
max-line-length = 200 | |||
[metadata] | |||
description-file = README.md | |||
[mypy] | |||
warn_redundant_casts = True | |||
# TODO: Enable back once false positives are fixed. | |||
# See: https://github.com/python/mypy/issues/4412 | |||
#warn_unused_ignores = True | |||
warn_unused_configs = True | |||
disallow_untyped_defs = True | |||
# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 | |||
[mypy-d3m.container.list] | |||
ignore_errors = True | |||
# TODO: Remove once this is fixed: https://github.com/python/mypy/issues/4300 | |||
[mypy-d3m.metadata.hyperparams] | |||
ignore_errors = True | |||
# TODO: Remove once this is fixed: https://github.com/python/mypy/pull/4384#issuecomment-354033177 | |||
[mypy-d3m.primitive_interfaces.distance] | |||
ignore_errors = True | |||
[mypy-common_primitives.slacker.*] | |||
ignore_errors = True |
@@ -1,65 +0,0 @@ | |||
import os | |||
import sys | |||
from setuptools import setup, find_packages | |||
PACKAGE_NAME = 'common_primitives' | |||
MINIMUM_PYTHON_VERSION = 3, 6 | |||
def check_python_version(): | |||
"""Exit when the Python version is too low.""" | |||
if sys.version_info < MINIMUM_PYTHON_VERSION: | |||
sys.exit("Python {}.{}+ is required.".format(*MINIMUM_PYTHON_VERSION)) | |||
def read_package_variable(key): | |||
"""Read the value of a variable from the package without importing.""" | |||
module_path = os.path.join(PACKAGE_NAME, '__init__.py') | |||
with open(module_path) as module: | |||
for line in module: | |||
parts = line.strip().split(' ') | |||
if parts and parts[0] == key: | |||
return parts[-1].strip("'") | |||
raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) | |||
def read_readme(): | |||
with open(os.path.join(os.path.dirname(__file__), 'README.md'), encoding='utf8') as file: | |||
return file.read() | |||
def read_entry_points(): | |||
with open('entry_points.ini') as entry_points: | |||
return entry_points.read() | |||
check_python_version() | |||
version = read_package_variable('__version__') | |||
setup( | |||
name=PACKAGE_NAME, | |||
version=version, | |||
description='D3M common primitives', | |||
author=read_package_variable('__author__'), | |||
packages=find_packages(exclude=['contrib', 'docs', 'tests*']), | |||
data_files=[('./', ['./entry_points.ini'])], | |||
install_requires=[ | |||
'd3m', | |||
'pandas', | |||
'scikit-learn', | |||
'numpy', | |||
'lightgbm>=2.2.2,<=2.3.0', | |||
'opencv-python-headless<=4.1.1.26,>=4.1', | |||
'imageio>=2.3.0,<=2.6.0', | |||
'pillow==6.2.1', | |||
'xgboost>=0.81,<=0.90', | |||
], | |||
entry_points=read_entry_points(), | |||
url='https://gitlab.com/datadrivendiscovery/common-primitives', | |||
long_description=read_readme(), | |||
long_description_content_type='text/markdown', | |||
license='Apache-2.0', | |||
classifiers=[ | |||
'License :: OSI Approved :: Apache Software License', | |||
], | |||
) |
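# Editor's sketch (not part of the original setup.py): once installed, the | |||
# primitives registered through entry_points.ini are discoverable via the | |||
# d3m index, e.g.: | |||
# | |||
#     from d3m import index | |||
#     primitive = index.get_primitive( | |||
#         'd3m.primitives.data_transformation.dataset_to_dataframe.Common') | |||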
@@ -1,2 +0,0 @@ | |||
.pyc | |||
__pycache__ |
@@ -1,31 +0,0 @@ | |||
scikit-learn==0.22.0 | |||
pytypes==1.0b5 | |||
frozendict==1.2 | |||
numpy>=1.15.4,<=1.18.1 | |||
jsonschema==2.6.0 | |||
requests>=2.19.1,<=2.22.0 | |||
strict-rfc3339==0.7 | |||
rfc3987==1.3.8 | |||
webcolors>=1.8.1,<=1.10 | |||
dateparser>=0.7.0,<=0.7.2 | |||
python-dateutil==2.8.1 | |||
pandas==0.25 | |||
typing-inspect==0.5.0 | |||
GitPython>=2.1.11,<=3.0.5 | |||
jsonpath-ng==1.4.3 | |||
custom-inherit>=2.2.0,<=2.2.2 | |||
PyYAML>=5.1,<=5.3 | |||
pycurl>=7.43.0.2,<=7.43.0.3 | |||
pyarrow==0.15.1 | |||
gputil>=1.3.0,<=1.4.0 | |||
pyrsistent>=0.14.11,<=0.15.7 | |||
scipy>=1.2.1,<=1.4.1 | |||
openml==0.10.1 | |||
lightgbm>=2.2.2,<=2.3.0 | |||
opencv-python-headless<=4.1.1.26,>=4.1 | |||
imageio>=2.3.0,<=2.6.0 | |||
pillow==6.2.1 | |||
xgboost>=0.81,<=0.90 | |||
Jinja2==2.9.4 | |||
simplejson==3.12.0 | |||
gitdb2==2.0.6 |
@@ -1,106 +0,0 @@ | |||
import os | |||
from setuptools import setup, find_packages | |||
PACKAGE_NAME = 'sklearn_wrap' | |||
def read_package_variable(key): | |||
"""Read the value of a variable from the package without importing.""" | |||
module_path = os.path.join(PACKAGE_NAME, '__init__.py') | |||
with open(module_path) as module: | |||
for line in module: | |||
parts = line.strip().split(' ') | |||
if parts and parts[0] == key: | |||
return parts[-1].strip("'") | |||
raise KeyError("'{0}' not found in '{1}'".format(key, module_path)) | |||
setup( | |||
name=PACKAGE_NAME, | |||
version=read_package_variable('__version__'), | |||
description='Primitives created using the Sklearn auto wrapper', | |||
author=read_package_variable('__author__'), | |||
packages=find_packages(exclude=['contrib', 'docs', 'tests*']), | |||
install_requires=[ | |||
'd3m', | |||
'Jinja2==2.9.4', | |||
'simplejson==3.12.0', | |||
'scikit-learn==0.22.0', | |||
], | |||
url='https://gitlab.datadrivendiscovery.org/jpl/sklearn-wrapping', | |||
entry_points = { | |||
'd3m.primitives': [ | |||
'data_cleaning.string_imputer.SKlearn = sklearn_wrap.SKStringImputer:SKStringImputer', | |||
'classification.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingClassifier:SKGradientBoostingClassifier', | |||
'classification.quadratic_discriminant_analysis.SKlearn = sklearn_wrap.SKQuadraticDiscriminantAnalysis:SKQuadraticDiscriminantAnalysis', | |||
'classification.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeClassifier:SKDecisionTreeClassifier', | |||
'classification.sgd.SKlearn = sklearn_wrap.SKSGDClassifier:SKSGDClassifier', | |||
'classification.nearest_centroid.SKlearn = sklearn_wrap.SKNearestCentroid:SKNearestCentroid', | |||
'classification.mlp.SKlearn = sklearn_wrap.SKMLPClassifier:SKMLPClassifier', | |||
'classification.bagging.SKlearn = sklearn_wrap.SKBaggingClassifier:SKBaggingClassifier', | |||
'classification.linear_svc.SKlearn = sklearn_wrap.SKLinearSVC:SKLinearSVC', | |||
'classification.linear_discriminant_analysis.SKlearn = sklearn_wrap.SKLinearDiscriminantAnalysis:SKLinearDiscriminantAnalysis', | |||
'classification.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveClassifier:SKPassiveAggressiveClassifier', | |||
'classification.gaussian_naive_bayes.SKlearn = sklearn_wrap.SKGaussianNB:SKGaussianNB', | |||
'classification.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostClassifier:SKAdaBoostClassifier', | |||
'classification.random_forest.SKlearn = sklearn_wrap.SKRandomForestClassifier:SKRandomForestClassifier', | |||
'classification.svc.SKlearn = sklearn_wrap.SKSVC:SKSVC', | |||
'classification.multinomial_naive_bayes.SKlearn = sklearn_wrap.SKMultinomialNB:SKMultinomialNB', | |||
'classification.dummy.SKlearn = sklearn_wrap.SKDummyClassifier:SKDummyClassifier', | |||
'classification.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesClassifier:SKExtraTreesClassifier', | |||
'classification.logistic_regression.SKlearn = sklearn_wrap.SKLogisticRegression:SKLogisticRegression', | |||
'classification.bernoulli_naive_bayes.SKlearn = sklearn_wrap.SKBernoulliNB:SKBernoulliNB', | |||
'classification.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsClassifier:SKKNeighborsClassifier', | |||
'regression.decision_tree.SKlearn = sklearn_wrap.SKDecisionTreeRegressor:SKDecisionTreeRegressor', | |||
'regression.ada_boost.SKlearn = sklearn_wrap.SKAdaBoostRegressor:SKAdaBoostRegressor', | |||
'regression.k_neighbors.SKlearn = sklearn_wrap.SKKNeighborsRegressor:SKKNeighborsRegressor', | |||
'regression.linear.SKlearn = sklearn_wrap.SKLinearRegression:SKLinearRegression', | |||
'regression.bagging.SKlearn = sklearn_wrap.SKBaggingRegressor:SKBaggingRegressor', | |||
'regression.lasso_cv.SKlearn = sklearn_wrap.SKLassoCV:SKLassoCV', | |||
'regression.elastic_net.SKlearn = sklearn_wrap.SKElasticNet:SKElasticNet', | |||
'regression.ard.SKlearn = sklearn_wrap.SKARDRegression:SKARDRegression', | |||
'regression.svr.SKlearn = sklearn_wrap.SKSVR:SKSVR', | |||
'regression.ridge.SKlearn = sklearn_wrap.SKRidge:SKRidge', | |||
'regression.gaussian_process.SKlearn = sklearn_wrap.SKGaussianProcessRegressor:SKGaussianProcessRegressor', | |||
'regression.mlp.SKlearn = sklearn_wrap.SKMLPRegressor:SKMLPRegressor', | |||
'regression.dummy.SKlearn = sklearn_wrap.SKDummyRegressor:SKDummyRegressor', | |||
'regression.sgd.SKlearn = sklearn_wrap.SKSGDRegressor:SKSGDRegressor', | |||
'regression.lasso.SKlearn = sklearn_wrap.SKLasso:SKLasso', | |||
'regression.lars.SKlearn = sklearn_wrap.SKLars:SKLars', | |||
'regression.extra_trees.SKlearn = sklearn_wrap.SKExtraTreesRegressor:SKExtraTreesRegressor', | |||
'regression.linear_svr.SKlearn = sklearn_wrap.SKLinearSVR:SKLinearSVR', | |||
'regression.random_forest.SKlearn = sklearn_wrap.SKRandomForestRegressor:SKRandomForestRegressor', | |||
'regression.gradient_boosting.SKlearn = sklearn_wrap.SKGradientBoostingRegressor:SKGradientBoostingRegressor', | |||
'regression.passive_aggressive.SKlearn = sklearn_wrap.SKPassiveAggressiveRegressor:SKPassiveAggressiveRegressor', | |||
'regression.kernel_ridge.SKlearn = sklearn_wrap.SKKernelRidge:SKKernelRidge', | |||
'data_preprocessing.max_abs_scaler.SKlearn = sklearn_wrap.SKMaxAbsScaler:SKMaxAbsScaler', | |||
'data_preprocessing.normalizer.SKlearn = sklearn_wrap.SKNormalizer:SKNormalizer', | |||
'data_preprocessing.robust_scaler.SKlearn = sklearn_wrap.SKRobustScaler:SKRobustScaler', | |||
'data_preprocessing.tfidf_vectorizer.SKlearn = sklearn_wrap.SKTfidfVectorizer:SKTfidfVectorizer', | |||
'data_transformation.one_hot_encoder.SKlearn = sklearn_wrap.SKOneHotEncoder:SKOneHotEncoder', | |||
'data_preprocessing.truncated_svd.SKlearn = sklearn_wrap.SKTruncatedSVD:SKTruncatedSVD', | |||
'feature_selection.select_percentile.SKlearn = sklearn_wrap.SKSelectPercentile:SKSelectPercentile', | |||
'feature_extraction.pca.SKlearn = sklearn_wrap.SKPCA:SKPCA', | |||
'data_preprocessing.count_vectorizer.SKlearn = sklearn_wrap.SKCountVectorizer:SKCountVectorizer', | |||
'data_transformation.ordinal_encoder.SKlearn = sklearn_wrap.SKOrdinalEncoder:SKOrdinalEncoder', | |||
'data_preprocessing.binarizer.SKlearn = sklearn_wrap.SKBinarizer:SKBinarizer', | |||
'data_cleaning.missing_indicator.SKlearn = sklearn_wrap.SKMissingIndicator:SKMissingIndicator', | |||
'feature_selection.select_fwe.SKlearn = sklearn_wrap.SKSelectFwe:SKSelectFwe', | |||
'data_preprocessing.rbf_sampler.SKlearn = sklearn_wrap.SKRBFSampler:SKRBFSampler', | |||
'data_preprocessing.min_max_scaler.SKlearn = sklearn_wrap.SKMinMaxScaler:SKMinMaxScaler', | |||
'data_preprocessing.random_trees_embedding.SKlearn = sklearn_wrap.SKRandomTreesEmbedding:SKRandomTreesEmbedding', | |||
'data_transformation.gaussian_random_projection.SKlearn = sklearn_wrap.SKGaussianRandomProjection:SKGaussianRandomProjection', | |||
'feature_extraction.kernel_pca.SKlearn = sklearn_wrap.SKKernelPCA:SKKernelPCA', | |||
'data_preprocessing.polynomial_features.SKlearn = sklearn_wrap.SKPolynomialFeatures:SKPolynomialFeatures', | |||
'data_preprocessing.feature_agglomeration.SKlearn = sklearn_wrap.SKFeatureAgglomeration:SKFeatureAgglomeration', | |||
'data_cleaning.imputer.SKlearn = sklearn_wrap.SKImputer:SKImputer', | |||
'data_preprocessing.standard_scaler.SKlearn = sklearn_wrap.SKStandardScaler:SKStandardScaler', | |||
'data_transformation.fast_ica.SKlearn = sklearn_wrap.SKFastICA:SKFastICA', | |||
'data_preprocessing.quantile_transformer.SKlearn = sklearn_wrap.SKQuantileTransformer:SKQuantileTransformer', | |||
'data_transformation.sparse_random_projection.SKlearn = sklearn_wrap.SKSparseRandomProjection:SKSparseRandomProjection', | |||
'data_preprocessing.nystroem.SKlearn = sklearn_wrap.SKNystroem:SKNystroem', | |||
'feature_selection.variance_threshold.SKlearn = sklearn_wrap.SKVarianceThreshold:SKVarianceThreshold', | |||
'feature_selection.generic_univariate_select.SKlearn = sklearn_wrap.SKGenericUnivariateSelect:SKGenericUnivariateSelect', | |||
], | |||
}, | |||
) |
@@ -1,470 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.linear_model.bayes import ARDRegression | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
coef_: Optional[ndarray] | |||
alpha_: Optional[float] | |||
lambda_: Optional[ndarray] | |||
sigma_: Optional[ndarray] | |||
scores_: Optional[Sequence[Any]] | |||
intercept_: Optional[float] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
n_iter = hyperparams.Bounded[int]( | |||
default=300, | |||
lower=0, | |||
upper=None, | |||
description='Maximum number of iterations. Default is 300', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
tol = hyperparams.Bounded[float]( | |||
default=0.001, | |||
lower=0, | |||
upper=None, | |||
description='Stop the algorithm if w has converged. Default is 1.e-3.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
alpha_1 = hyperparams.Hyperparameter[float]( | |||
default=1e-06, | |||
description='Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
alpha_2 = hyperparams.Hyperparameter[float]( | |||
default=1e-06, | |||
description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter. Default is 1.e-6.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
lambda_1 = hyperparams.Hyperparameter[float]( | |||
default=1e-06, | |||
description='Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
lambda_2 = hyperparams.Hyperparameter[float]( | |||
default=1e-06, | |||
description='Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter. Default is 1.e-6.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
threshold_lambda = hyperparams.Hyperparameter[float]( | |||
default=10000.0, | |||
description='threshold for removing (pruning) weights with high precision from the computation. Default is 1.e+4.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
fit_intercept = hyperparams.UniformBool( | |||
default=True, | |||
description='whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (e.g. data is expected to be already centered). Default is True.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
normalize = hyperparams.UniformBool( | |||
default=False, | |||
description='If True, the regressors X will be normalized before regression. This parameter is ignored when `fit_intercept` is set to False. When the regressors are normalized, note that this makes the hyperparameters learnt more robust and almost independent of the number of samples. The same property is not valid for standardized data. However, if you wish to standardize, please use `preprocessing.StandardScaler` before calling `fit` on an estimator with `normalize=False`. copy_X : boolean, optional, default True. If True, X will be copied; else, it may be overwritten.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
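# Editor's note (illustrative, not from the original file): a Hyperparams | |||
# class like the one above is typically instantiated from its defaults, | |||
# overriding individual values, e.g. | |||
# | |||
#     hp = Hyperparams.defaults().replace({'n_iter': 500, 'use_semantic_types': True}) | |||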
class SKARDRegression(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn ARDRegression | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.BAYESIAN_LINEAR_REGRESSION, ], | |||
"name": "sklearn.linear_model.bayes.ARDRegression", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.ard.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html']}, | |||
"version": "2019.11.13", | |||
"id": "966dd2c4-d439-3ad6-b49f-17706595606c", | |||
"hyperparams_to_tune": ['n_iter'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_copy_X: bool = True, | |||
_verbose: typing.Optional[bool] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = ARDRegression( | |||
n_iter=self.hyperparams['n_iter'], | |||
tol=self.hyperparams['tol'], | |||
alpha_1=self.hyperparams['alpha_1'], | |||
alpha_2=self.hyperparams['alpha_2'], | |||
lambda_1=self.hyperparams['lambda_1'], | |||
lambda_2=self.hyperparams['lambda_2'], | |||
threshold_lambda=self.hyperparams['threshold_lambda'], | |||
fit_intercept=self.hyperparams['fit_intercept'], | |||
normalize=self.hyperparams['normalize'], | |||
copy_X=_copy_X, | |||
verbose=_verbose | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                coef_=None,
                alpha_=None,
                lambda_=None,
                sigma_=None,
                scores_=None,
                intercept_=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            coef_=getattr(self._clf, 'coef_', None),
            alpha_=getattr(self._clf, 'alpha_', None),
            lambda_=getattr(self._clf, 'lambda_', None),
            sigma_=getattr(self._clf, 'sigma_', None),
            scores_=getattr(self._clf, 'scores_', None),
            intercept_=getattr(self._clf, 'intercept_', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.coef_ = params['coef_']
        self._clf.alpha_ = params['alpha_']
        self._clf.lambda_ = params['lambda_']
        self._clf.sigma_ = params['sigma_']
        self._clf.scores_ = params['scores_']
        self._clf.intercept_ = params['intercept_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        # The primitive counts as fitted if any learned attribute was restored.
        if any(params[name] is not None for name in ('coef_', 'alpha_', 'lambda_', 'sigma_', 'scores_', 'intercept_')):
            self._fitted = True

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_inputs_columns'],
                                                                                   exclude_columns=hyperparams['exclude_inputs_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types.
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False
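    # Illustration of the set-difference check above (hypothetical values): with
    # accepted_semantic_types = {".../types/Attribute"} and a column annotated with
    # {".../types/Attribute", ".../types/Float"}, the difference is empty, so the
    # column is accepted; a column annotated only with {".../types/Float"} is not.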

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types.
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
                                                                                             use_columns=hyperparams['use_outputs_columns'],
                                                                                             exclude_columns=hyperparams['exclude_outputs_columns'],
                                                                                             can_use_column=can_produce_column)
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = []
        for idx in target_column_indices:
            target_column_names.append(data.columns[idx])
        return targets, target_column_names, target_column_indices

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/SuggestedTarget"])
            add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
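    # For example (hypothetical values): a target column annotated with
    # {".../types/TrueTarget", ".../types/SuggestedTarget", ".../types/Target"}
    # comes out annotated with {".../types/Target", ".../types/PredictedTarget"}
    # plus whatever the return_semantic_type hyper-parameter is set to.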

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


SKARDRegression.__doc__ = ARDRegression.__doc__
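
A minimal sketch of the Params round trip implemented above, assuming `primitive` is an already fitted SKARDRegression and `hyperparams` is the Hyperparams instance it was constructed with (all variable names here are illustrative, not part of the original file):

params = primitive.get_params()
restored = SKARDRegression(hyperparams=hyperparams, random_seed=0)
restored.set_params(params=params)  # restores the learned attributes and marks the primitive as fitted
predictions = restored.produce(inputs=some_inputs).value  # no second fit needed; `some_inputs` is hypothetical
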
@@ -1,498 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble import AdaBoostClassifier  # public path; sklearn.ensemble.weight_boosting is a deprecated private module

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas

Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
    estimators_: Optional[Sequence[sklearn.base.BaseEstimator]]
    classes_: Optional[ndarray]
    n_classes_: Optional[int]
    estimator_weights_: Optional[ndarray]
    estimator_errors_: Optional[ndarray]
    base_estimator_: Optional[object]
    estimator_params: Optional[tuple]
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]


class Hyperparams(hyperparams.Hyperparams):
    base_estimator = hyperparams.Constant(
        default=None,
        description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required, as well as proper `classes_` and `n_classes_` attributes.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    n_estimators = hyperparams.Bounded[int](
        lower=1,
        upper=None,
        default=50,
        description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    learning_rate = hyperparams.Uniform(
        lower=0.01,
        upper=2,
        default=0.1,
        description='Learning rate shrinks the contribution of each classifier by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    algorithm = hyperparams.Enumeration[str](
        values=['SAMME.R', 'SAMME'],
        default='SAMME.R',
        description='If \'SAMME.R\' then use the SAMME.R real boosting algorithm. ``base_estimator`` must support calculation of class probabilities. If \'SAMME\' then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
    )
    use_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
    )
    exclude_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
        default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        description='Decides what semantic type to attach to generated output.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
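
# A usage note (based on the d3m core package API, not part of the original file):
# Hyperparams classes expose `defaults()` and `replace()`, so a tuned configuration
# can be built as, e.g., Hyperparams.defaults().replace({'n_estimators': 100, 'learning_rate': 0.5}).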


class SKAdaBoostClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams],
                           ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn AdaBoostClassifier
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html>`_
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ],
        "name": "sklearn.ensemble.weight_boosting.AdaBoostClassifier",
        "primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION,
        "python_path": "d3m.primitives.classification.ada_boost.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html']},
        "version": "2019.11.13",
        "id": "4210a6a6-14ab-4490-a7dc-460763e70e55",
        "hyperparams_to_tune": ['learning_rate', 'n_estimators'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
             }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = AdaBoostClassifier(
            base_estimator=self.hyperparams['base_estimator'],
            n_estimators=self.hyperparams['n_estimators'],
            learning_rate=self.hyperparams['learning_rate'],
            algorithm=self.hyperparams['algorithm'],
            random_state=self.random_seed,
        )
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: Optional[List[OrderedDict]] = None
        self._input_column_names = None
        self._fitted = False
        self._new_training_data = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._fitted = False
        self._new_training_data = True

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._inputs is None or self._outputs is None:
            raise ValueError("Missing training data.")
        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.predict(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            # For primitives that allow predicting without fitting, like GaussianProcessRegressor.
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)
        return CallResult(outputs)

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                estimators_=None,
                classes_=None,
                n_classes_=None,
                estimator_weights_=None,
                estimator_errors_=None,
                base_estimator_=None,
                estimator_params=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            estimators_=getattr(self._clf, 'estimators_', None),
            classes_=getattr(self._clf, 'classes_', None),
            n_classes_=getattr(self._clf, 'n_classes_', None),
            estimator_weights_=getattr(self._clf, 'estimator_weights_', None),
            estimator_errors_=getattr(self._clf, 'estimator_errors_', None),
            base_estimator_=getattr(self._clf, 'base_estimator_', None),
            estimator_params=getattr(self._clf, 'estimator_params', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.estimators_ = params['estimators_']
        self._clf.classes_ = params['classes_']
        self._clf.n_classes_ = params['n_classes_']
        self._clf.estimator_weights_ = params['estimator_weights_']
        self._clf.estimator_errors_ = params['estimator_errors_']
        self._clf.base_estimator_ = params['base_estimator_']
        self._clf.estimator_params = params['estimator_params']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        # The primitive counts as fitted if any learned attribute was restored.
        if any(params[name] is not None for name in (
                'estimators_', 'classes_', 'n_classes_', 'estimator_weights_',
                'estimator_errors_', 'base_estimator_', 'estimator_params')):
            self._fitted = True

    def log_likelihoods(self, *,
                        outputs: Outputs,
                        inputs: Inputs,
                        timeout: float = None,
                        iterations: int = None) -> CallResult[Sequence[float]]:
        inputs = inputs.iloc[:, self._training_indices]  # Get ndarray
        outputs = outputs.iloc[:, self._target_column_indices]

        if len(inputs.columns) and len(outputs.columns):
            if outputs.shape[1] != self._clf.n_outputs_:
                raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")

            log_proba = self._clf.predict_log_proba(inputs)

            # Making it always a list, even when only one target.
            if self._clf.n_outputs_ == 1:
                log_proba = [log_proba]
                classes = [self._clf.classes_]
            else:
                classes = self._clf.classes_

            samples_length = inputs.shape[0]

            log_likelihoods = []
            for k in range(self._clf.n_outputs_):
                # We have to map each class to its internal (numerical) index used in the learner.
                # This allows "outputs" to contain string classes.
                outputs_column = outputs.iloc[:, k]
                classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k])
                mapped_outputs_column = outputs_column.map(classes_map)

                # For each target column (column in "outputs"), for each sample (row) we pick the log
                # likelihood for a given class.
                log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column])

            results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True)
            results.columns = outputs.columns

            for k in range(self._clf.n_outputs_):
                column_metadata = outputs.metadata.query_column(k)
                if 'name' in column_metadata:
                    results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']})
        else:
            results = d3m_dataframe(generate_metadata=True)

        return CallResult(results)
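    # Worked illustration of the class-to-index mapping above (hypothetical values):
    # with classes[k] = numpy.array(['no', 'yes']), classes_map is {'no': 0, 'yes': 1},
    # so an outputs column ['yes', 'no', 'yes'] maps to [1, 0, 1], and each row of
    # log_proba[k] is indexed at that position to pick that sample's log likelihood.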

    def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
        output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
        output.columns = self._input_column_names
        for i in range(len(self._input_column_names)):
            output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
        return CallResult(output)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_inputs_columns'],
                                                                                   exclude_columns=hyperparams['exclude_inputs_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types.
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types.
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
                                                                                             use_columns=hyperparams['use_outputs_columns'],
                                                                                             exclude_columns=hyperparams['exclude_outputs_columns'],
                                                                                             can_use_column=can_produce_column)
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = []
        for idx in target_column_indices:
            target_column_names.append(data.columns[idx])
        return targets, target_column_names, target_column_indices

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/SuggestedTarget"])
            add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


SKAdaBoostClassifier.__doc__ = AdaBoostClassifier.__doc__
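
A minimal fit/produce sketch for the class above, using a toy numeric DataFrame; with `use_semantic_types` left at its default of False all columns are used, so no semantic-type annotation is needed (all variable names here are illustrative, not part of the original file):

features = d3m_dataframe({'a': [0.0, 1.0, 2.0, 3.0], 'b': [1.0, 0.0, 1.0, 0.0]}, generate_metadata=True)
labels = d3m_dataframe({'target': [0, 1, 0, 1]}, generate_metadata=True)

primitive = SKAdaBoostClassifier(hyperparams=Hyperparams.defaults())
primitive.set_training_data(inputs=features, outputs=labels)
primitive.fit()
predictions = primitive.produce(inputs=features).value  # a DataFrame of PredictedTarget columns
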
@@ -1,437 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble import AdaBoostRegressor  # public path; sklearn.ensemble.weight_boosting is a deprecated private module

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas

Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
    estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]]
    estimator_weights_: Optional[ndarray]
    estimator_errors_: Optional[ndarray]
    estimator_params: Optional[tuple]
    base_estimator_: Optional[object]
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]


class Hyperparams(hyperparams.Hyperparams):
    base_estimator = hyperparams.Constant(
        default=None,
        description='The base estimator from which the boosted ensemble is built. Support for sample weighting is required.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    n_estimators = hyperparams.Bounded[int](
        lower=1,
        upper=None,
        default=50,
        description='The maximum number of estimators at which boosting is terminated. In case of perfect fit, the learning procedure is stopped early.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    learning_rate = hyperparams.Uniform(
        lower=0.01,
        upper=2,
        default=0.1,
        description='Learning rate shrinks the contribution of each regressor by ``learning_rate``. There is a trade-off between ``learning_rate`` and ``n_estimators``.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    loss = hyperparams.Enumeration[str](
        values=['linear', 'square', 'exponential'],
        default='linear',
        description='The loss function to use when updating the weights after each boosting iteration.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.",
    )
    use_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.",
    )
    exclude_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
        default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        description='Decides what semantic type to attach to generated output.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )


class SKAdaBoostRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn AdaBoostRegressor
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html>`_
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.ADABOOST, ],
        "name": "sklearn.ensemble.weight_boosting.AdaBoostRegressor",
        "primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
        "python_path": "d3m.primitives.regression.ada_boost.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html']},
        "version": "2019.11.13",
        "id": "6cab1537-02e1-4dc4-9ebb-53fa2cbabedd",
        "hyperparams_to_tune": ['learning_rate', 'n_estimators'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
             }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = AdaBoostRegressor(
            base_estimator=self.hyperparams['base_estimator'],
            n_estimators=self.hyperparams['n_estimators'],
            learning_rate=self.hyperparams['learning_rate'],
            loss=self.hyperparams['loss'],
            random_state=self.random_seed,
        )
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: Optional[List[OrderedDict]] = None
        self._input_column_names = None
        self._fitted = False
        self._new_training_data = False

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._fitted = False
        self._new_training_data = True

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._inputs is None or self._outputs is None:
            raise ValueError("Missing training data.")
        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.predict(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            # For primitives that allow predicting without fitting, like GaussianProcessRegressor.
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)
        return CallResult(outputs)

    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                estimators_=None,
                estimator_weights_=None,
                estimator_errors_=None,
                estimator_params=None,
                base_estimator_=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            estimators_=getattr(self._clf, 'estimators_', None),
            estimator_weights_=getattr(self._clf, 'estimator_weights_', None),
            estimator_errors_=getattr(self._clf, 'estimator_errors_', None),
            estimator_params=getattr(self._clf, 'estimator_params', None),
            base_estimator_=getattr(self._clf, 'base_estimator_', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.estimators_ = params['estimators_']
        self._clf.estimator_weights_ = params['estimator_weights_']
        self._clf.estimator_errors_ = params['estimator_errors_']
        self._clf.estimator_params = params['estimator_params']
        self._clf.base_estimator_ = params['base_estimator_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']
        # The primitive counts as fitted if any learned attribute was restored.
        if any(params[name] is not None for name in (
                'estimators_', 'estimator_weights_', 'estimator_errors_',
                'estimator_params', 'base_estimator_')):
            self._fitted = True

    def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]:
        output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names))))
        output.columns = self._input_column_names
        for i in range(len(self._input_column_names)):
            output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]})
        return CallResult(output)

    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata,
                                                                                   use_columns=hyperparams['use_inputs_columns'],
                                                                                   exclude_columns=hyperparams['exclude_inputs_columns'],
                                                                                   can_use_column=can_produce_column)
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types.
        if len(accepted_semantic_types - semantic_types) == 0:
            return True
        return False

    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types.
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata,
                                                                                             use_columns=hyperparams['use_outputs_columns'],
                                                                                             exclude_columns=hyperparams['exclude_outputs_columns'],
                                                                                             can_use_column=can_produce_column)
        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = []
        for idx in target_column_indices:
            target_column_names.append(data.columns[idx])
        return targets, target_column_names, target_column_indices

    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget", "https://metadata.datadrivendiscovery.org/types/SuggestedTarget"])
            add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget"])
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


SKAdaBoostRegressor.__doc__ = AdaBoostRegressor.__doc__
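
A short sketch of produce_feature_importances as defined above, assuming `primitive` is an already fitted SKAdaBoostRegressor (the name is illustrative, not part of the original file); the result is a one-row DataFrame with one importance score per training input column:

importances = primitive.produce_feature_importances().value
print(importances.iloc[0])  # importance score for each input column
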
@@ -1,589 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.ensemble import BaggingClassifier  # public path; sklearn.ensemble.bagging is a deprecated private module

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas

Inputs = d3m_dataframe
Outputs = d3m_dataframe


class Params(params.Params):
    base_estimator_: Optional[object]
    estimators_: Optional[List[sklearn.tree.DecisionTreeClassifier]]
    estimators_features_: Optional[List[ndarray]]
    classes_: Optional[ndarray]
    n_classes_: Optional[int]
    oob_score_: Optional[float]
    oob_decision_function_: Optional[List[ndarray]]
    n_features_: Optional[int]
    _max_features: Optional[int]
    _max_samples: Optional[int]
    _n_samples: Optional[int]
    _seeds: Optional[ndarray]
    estimator_params: Optional[tuple]
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]


class Hyperparams(hyperparams.Hyperparams):
    n_estimators = hyperparams.Bounded[int](
        default=10,
        lower=1,
        upper=None,
        description='The number of base estimators in the ensemble.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    max_samples = hyperparams.Union(
        configuration=OrderedDict({
            'absolute': hyperparams.Bounded[int](
                lower=0,
                upper=None,
                default=0,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            ),
            'percent': hyperparams.Bounded[float](
                default=1.0,
                lower=0,
                upper=1,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )
        }),
        default='percent',
        description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
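    # A hedged note on the Union above (values here are illustrative only): the
    # concrete value type selects the branch, e.g. a float such as 0.8 validates
    # against the 'percent' configuration, while an int such as 100 validates
    # against 'absolute'.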
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='percent', | |||
description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
bootstrap = hyperparams.Enumeration[str]( | |||
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], | |||
default='bootstrap', | |||
description='Whether bootstrap samples are used when building trees.' | |||
' And whether to use out-of-bag samples to estimate the generalization accuracy.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
bootstrap_features = hyperparams.UniformBool( | |||
default=False, | |||
description='Whether features are drawn with replacement.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
warm_start = hyperparams.UniformBool( | |||
default=False, | |||
description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new ensemble. .. versionadded:: 0.17 *warm_start* constructor parameter.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
n_jobs = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'limit': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'all_cores': hyperparams.Constant( | |||
default=-1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='limit', | |||
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKBaggingClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn BaggingClassifier | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ], | |||
"name": "sklearn.ensemble.bagging.BaggingClassifier", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.bagging.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html']}, | |||
"version": "2019.11.13", | |||
"id": "1b2a32a6-0ec5-3ca0-9386-b8b1f1b831d1", | |||
"hyperparams_to_tune": ['n_estimators', 'max_samples', 'max_features'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_verbose: int = 0) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = BaggingClassifier( | |||
n_estimators=self.hyperparams['n_estimators'], | |||
max_samples=self.hyperparams['max_samples'], | |||
max_features=self.hyperparams['max_features'], | |||
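            # Map the three-way 'bootstrap' enumeration onto sklearn's two
            # booleans: both non-disabled values sample with replacement, and
            # only 'bootstrap_with_oob_score' also enables OOB scoring.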
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], | |||
bootstrap_features=self.hyperparams['bootstrap_features'], | |||
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], | |||
warm_start=self.hyperparams['warm_start'], | |||
n_jobs=self.hyperparams['n_jobs'], | |||
random_state=self.random_seed, | |||
verbose=_verbose | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
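    # With return_result='new' (the default) only the prediction columns (plus
    # any index columns when add_index_columns is set) are returned; 'append'
    # adds them to the input frame and 'replace' overwrites the original
    # target columns. base_utils.combine_columns implements this policy.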
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
base_estimator_=None, | |||
estimators_=None, | |||
estimators_features_=None, | |||
classes_=None, | |||
n_classes_=None, | |||
oob_score_=None, | |||
oob_decision_function_=None, | |||
n_features_=None, | |||
_max_features=None, | |||
_max_samples=None, | |||
_n_samples=None, | |||
_seeds=None, | |||
estimator_params=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
base_estimator_=getattr(self._clf, 'base_estimator_', None), | |||
estimators_=getattr(self._clf, 'estimators_', None), | |||
estimators_features_=getattr(self._clf, 'estimators_features_', None), | |||
classes_=getattr(self._clf, 'classes_', None), | |||
n_classes_=getattr(self._clf, 'n_classes_', None), | |||
oob_score_=getattr(self._clf, 'oob_score_', None), | |||
oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
_max_features=getattr(self._clf, '_max_features', None), | |||
_max_samples=getattr(self._clf, '_max_samples', None), | |||
_n_samples=getattr(self._clf, '_n_samples', None), | |||
_seeds=getattr(self._clf, '_seeds', None), | |||
estimator_params=getattr(self._clf, 'estimator_params', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.base_estimator_ = params['base_estimator_'] | |||
self._clf.estimators_ = params['estimators_'] | |||
self._clf.estimators_features_ = params['estimators_features_'] | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.n_classes_ = params['n_classes_'] | |||
self._clf.oob_score_ = params['oob_score_'] | |||
self._clf.oob_decision_function_ = params['oob_decision_function_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf._max_features = params['_max_features'] | |||
self._clf._max_samples = params['_max_samples'] | |||
self._clf._n_samples = params['_n_samples'] | |||
self._clf._seeds = params['_seeds'] | |||
self._clf.estimator_params = params['estimator_params'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['base_estimator_'] is not None: | |||
self._fitted = True | |||
if params['estimators_'] is not None: | |||
self._fitted = True | |||
if params['estimators_features_'] is not None: | |||
self._fitted = True | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['n_classes_'] is not None: | |||
self._fitted = True | |||
if params['oob_score_'] is not None: | |||
self._fitted = True | |||
if params['oob_decision_function_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['_max_features'] is not None: | |||
self._fitted = True | |||
if params['_max_samples'] is not None: | |||
self._fitted = True | |||
if params['_n_samples'] is not None: | |||
self._fitted = True | |||
if params['_seeds'] is not None: | |||
self._fitted = True | |||
if params['estimator_params'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
        inputs = inputs.iloc[:, self._training_indices]  # Select the columns used during training.
        outputs = outputs.iloc[:, self._target_column_indices]
        if len(inputs.columns) and len(outputs.columns):
            # BaggingClassifier does not expose ``n_outputs_``; this wrapper
            # ravels single-column targets in fit(), so default to one output.
            n_outputs = getattr(self._clf, 'n_outputs_', 1)
            if outputs.shape[1] != n_outputs:
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") | |||
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
            if n_outputs == 1:
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
            for k in range(n_outputs):
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
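                # For example, classes[k] == array(['no', 'yes']) gives the map
                # {'no': 0, 'yes': 1}, so a row labeled 'yes' selects column 1
                # of log_proba[k] below.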
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
            for k in range(n_outputs):
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKBaggingClassifier.__doc__ = BaggingClassifier.__doc__ |
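# A minimal, illustrative smoke test; an assumption, not part of the generated
# wrapper. It uses synthetic data, default hyper-parameters, and the default
# use_semantic_types=False, so every column is used as given.
if __name__ == '__main__':
    train_inputs = d3m_dataframe(numpy.random.rand(20, 3), generate_metadata=True)
    train_outputs = d3m_dataframe({'target': numpy.random.randint(0, 2, 20)}, generate_metadata=True)
    primitive = SKBaggingClassifier(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
    primitive.fit()
    print(primitive.produce(inputs=train_inputs).value)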
@@ -1,533 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.ensemble import BaggingRegressor
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
estimators_: Optional[List[sklearn.tree.DecisionTreeRegressor]] | |||
estimators_features_: Optional[List[ndarray]] | |||
oob_score_: Optional[float] | |||
oob_prediction_: Optional[ndarray] | |||
base_estimator_: Optional[object] | |||
n_features_: Optional[int] | |||
_max_features: Optional[int] | |||
_max_samples: Optional[int] | |||
_n_samples: Optional[int] | |||
_seeds: Optional[ndarray] | |||
estimator_params: Optional[tuple] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
base_estimator = hyperparams.Constant( | |||
default=None, | |||
description='The base estimator to fit on random subsets of the dataset. If None, then the base estimator is a decision tree.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
n_estimators = hyperparams.Bounded[int]( | |||
default=10, | |||
lower=1, | |||
upper=None, | |||
description='The number of base estimators in the ensemble.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_samples = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='percent', | |||
description='The number of samples to draw from X to train each base estimator. - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='percent', | |||
description='The number of features to draw from X to train each base estimator. - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
bootstrap = hyperparams.Enumeration[str]( | |||
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], | |||
default='bootstrap', | |||
        description='Whether bootstrap samples are used when building trees,'
                    ' and whether to use out-of-bag samples to estimate the generalization accuracy.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
bootstrap_features = hyperparams.UniformBool( | |||
default=False, | |||
description='Whether features are drawn with replacement.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
warm_start = hyperparams.UniformBool( | |||
default=False, | |||
        description='When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble; otherwise, just fit a whole new ensemble.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
n_jobs = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'limit': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'all_cores': hyperparams.Constant( | |||
default=-1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='limit', | |||
        description='The number of jobs to run in parallel for both `fit` and `predict`. None means 1 unless in a joblib.parallel_backend context; -1 means using all processors.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKBaggingRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn BaggingRegressor | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ENSEMBLE_LEARNING, ], | |||
"name": "sklearn.ensemble.bagging.BaggingRegressor", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.bagging.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html']}, | |||
"version": "2019.11.13", | |||
"id": "0dbc4b6d-aa57-4f11-ab18-36125880151b", | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_verbose: int = 0) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = BaggingRegressor( | |||
base_estimator=self.hyperparams['base_estimator'], | |||
n_estimators=self.hyperparams['n_estimators'], | |||
max_samples=self.hyperparams['max_samples'], | |||
max_features=self.hyperparams['max_features'], | |||
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], | |||
bootstrap_features=self.hyperparams['bootstrap_features'], | |||
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], | |||
warm_start=self.hyperparams['warm_start'], | |||
n_jobs=self.hyperparams['n_jobs'], | |||
random_state=self.random_seed, | |||
verbose=_verbose | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
estimators_=None, | |||
estimators_features_=None, | |||
oob_score_=None, | |||
oob_prediction_=None, | |||
base_estimator_=None, | |||
n_features_=None, | |||
_max_features=None, | |||
_max_samples=None, | |||
_n_samples=None, | |||
_seeds=None, | |||
estimator_params=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
estimators_=getattr(self._clf, 'estimators_', None), | |||
estimators_features_=getattr(self._clf, 'estimators_features_', None), | |||
oob_score_=getattr(self._clf, 'oob_score_', None), | |||
oob_prediction_=getattr(self._clf, 'oob_prediction_', None), | |||
base_estimator_=getattr(self._clf, 'base_estimator_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
_max_features=getattr(self._clf, '_max_features', None), | |||
_max_samples=getattr(self._clf, '_max_samples', None), | |||
_n_samples=getattr(self._clf, '_n_samples', None), | |||
_seeds=getattr(self._clf, '_seeds', None), | |||
estimator_params=getattr(self._clf, 'estimator_params', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.estimators_ = params['estimators_'] | |||
self._clf.estimators_features_ = params['estimators_features_'] | |||
self._clf.oob_score_ = params['oob_score_'] | |||
self._clf.oob_prediction_ = params['oob_prediction_'] | |||
self._clf.base_estimator_ = params['base_estimator_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf._max_features = params['_max_features'] | |||
self._clf._max_samples = params['_max_samples'] | |||
self._clf._n_samples = params['_n_samples'] | |||
self._clf._seeds = params['_seeds'] | |||
self._clf.estimator_params = params['estimator_params'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['estimators_'] is not None: | |||
self._fitted = True | |||
if params['estimators_features_'] is not None: | |||
self._fitted = True | |||
if params['oob_score_'] is not None: | |||
self._fitted = True | |||
if params['oob_prediction_'] is not None: | |||
self._fitted = True | |||
if params['base_estimator_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['_max_features'] is not None: | |||
self._fitted = True | |||
if params['_max_samples'] is not None: | |||
self._fitted = True | |||
if params['_n_samples'] is not None: | |||
self._fitted = True | |||
if params['_seeds'] is not None: | |||
self._fitted = True | |||
if params['estimator_params'] is not None: | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKBaggingRegressor.__doc__ = BaggingRegressor.__doc__ |
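# An illustrative sketch; an assumption, not part of the generated wrapper.
# The Params returned by get_params() can rehydrate a fresh instance without
# refitting, which is how fitted primitives are persisted and restored.
if __name__ == '__main__':
    default_hp = Hyperparams.defaults()
    fitted = SKBaggingRegressor(hyperparams=default_hp)
    fitted.set_training_data(
        inputs=d3m_dataframe(numpy.random.rand(20, 3), generate_metadata=True),
        outputs=d3m_dataframe({'y': numpy.random.rand(20)}, generate_metadata=True),
    )
    fitted.fit()
    restored = SKBaggingRegressor(hyperparams=default_hp)
    restored.set_params(params=fitted.get_params())  # set_params marks the copy as fitted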
@@ -1,508 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.naive_bayes import BernoulliNB | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
class_log_prior_: Optional[ndarray] | |||
feature_log_prob_: Optional[ndarray] | |||
class_count_: Optional[ndarray] | |||
feature_count_: Optional[ndarray] | |||
classes_: Optional[ndarray] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
alpha = hyperparams.Bounded[float]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
description='Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
binarize = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'float': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='float', | |||
description='Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
fit_prior = hyperparams.UniformBool( | |||
default=True, | |||
description='Whether to learn class prior probabilities or not. If false, a uniform prior will be used.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKBernoulliNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], | |||
ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn BernoulliNB | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ], | |||
"name": "sklearn.naive_bayes.BernoulliNB", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.bernoulli_naive_bayes.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html']}, | |||
"version": "2019.11.13", | |||
"id": "dfb1004e-02ac-3399-ba57-8a95639312cd", | |||
"hyperparams_to_tune": ['alpha', 'binarize', 'fit_prior'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = BernoulliNB( | |||
alpha=self.hyperparams['alpha'], | |||
binarize=self.hyperparams['binarize'], | |||
fit_prior=self.hyperparams['fit_prior'], | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._training_inputs is None or self._training_outputs is None: | |||
raise ValueError("Missing training data.") | |||
        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False
        # Pick up the batch most recently passed to set_training_data,
        # mirroring fit(); otherwise partial_fit would keep re-using the
        # batch captured by the previous fit() call.
        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.partial_fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
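    # Illustrative usage sketch (an assumption; batch names are hypothetical):
    # after an initial fit(), new batches supplied through set_training_data()
    # are folded in by continue_fit(), which delegates to
    # BernoulliNB.partial_fit():
    #
    #     primitive.set_training_data(inputs=batch_1_X, outputs=batch_1_y)
    #     primitive.fit()
    #     primitive.set_training_data(inputs=batch_2_X, outputs=batch_2_y)
    #     primitive.continue_fit()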
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
class_log_prior_=None, | |||
feature_log_prob_=None, | |||
class_count_=None, | |||
feature_count_=None, | |||
classes_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
class_log_prior_=getattr(self._clf, 'class_log_prior_', None), | |||
feature_log_prob_=getattr(self._clf, 'feature_log_prob_', None), | |||
class_count_=getattr(self._clf, 'class_count_', None), | |||
feature_count_=getattr(self._clf, 'feature_count_', None), | |||
classes_=getattr(self._clf, 'classes_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.class_log_prior_ = params['class_log_prior_'] | |||
self._clf.feature_log_prob_ = params['feature_log_prob_'] | |||
self._clf.class_count_ = params['class_count_'] | |||
self._clf.feature_count_ = params['feature_count_'] | |||
self._clf.classes_ = params['classes_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['class_log_prior_'] is not None: | |||
self._fitted = True | |||
if params['feature_log_prob_'] is not None: | |||
self._fitted = True | |||
if params['class_count_'] is not None: | |||
self._fitted = True | |||
if params['feature_count_'] is not None: | |||
self._fitted = True | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
        inputs = inputs.iloc[:, self._training_indices]  # Keep only the columns used during fitting.
outputs = outputs.iloc[:, self._target_column_indices] | |||
if len(inputs.columns) and len(outputs.columns): | |||
if outputs.shape[1] != self._clf.n_outputs_: | |||
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") | |||
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
if self._clf.n_outputs_ == 1: | |||
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
for k in range(self._clf.n_outputs_): | |||
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
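            # e.g. classes[k] == ['no', 'yes'] yields classes_map {'no': 0, 'yes': 1},
            # so a string column ['yes', 'no'] maps to [1, 0], usable as row-wise
            # column indices into log_proba[k].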
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
for k in range(self._clf.n_outputs_): | |||
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
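        # e.g. accepted_semantic_types == {'.../types/Attribute'} and
        # semantic_types == {'.../types/Attribute', 'http://schema.org/Float'}
        # leaves an empty difference, so the column is accepted.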
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(metadata, | |||
use_columns=hyperparams[ | |||
'use_outputs_columns'], | |||
exclude_columns= | |||
hyperparams[ | |||
'exclude_outputs_columns'], | |||
can_use_column=can_produce_column) | |||
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKBernoulliNB.__doc__ = BernoulliNB.__doc__ |
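# A minimal usage sketch (not part of the generated wrapper): it assumes a working
# d3m environment and a sklearn-wrap git checkout, and uses tiny illustrative data
# with the default hyperparams (use_semantic_types=False, so all columns are used
# as given).
if __name__ == '__main__':
    train_X = d3m_dataframe({'f0': [0.0, 1.0, 1.0, 0.0], 'f1': [1.0, 1.0, 0.0, 0.0]}, generate_metadata=True)
    train_y = d3m_dataframe({'target': ['a', 'b', 'b', 'a']}, generate_metadata=True)
    primitive = SKBernoulliNB(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=train_X, outputs=train_y)
    primitive.fit()
    print(primitive.produce(inputs=train_X).value)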
@@ -1,330 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.preprocessing import Binarizer
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
threshold = hyperparams.Bounded[float]( | |||
default=0.0, | |||
lower=0.0, | |||
upper=None, | |||
description='Feature values below or equal to this are replaced by 0, above it by 1. Threshold may not be less than 0 for operations on sparse matrices.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], | |||
default='https://metadata.datadrivendiscovery.org/types/Attribute', | |||
description='Decides what semantic type to attach to generated attributes', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKBinarizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn Binarizer | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.FEATURE_SCALING, ], | |||
"name": "sklearn.preprocessing.data.Binarizer", | |||
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, | |||
"python_path": "d3m.primitives.data_preprocessing.binarizer.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Binarizer.html']}, | |||
"version": "2019.11.13", | |||
"id": "13777068-9dc0-3c5b-b4da-99350d67ee3f", | |||
"hyperparams_to_tune": ['threshold'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = Binarizer( | |||
threshold=self.hyperparams['threshold'], | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
def set_training_data(self, *, inputs: Inputs) -> None: | |||
self._inputs = inputs | |||
self._fitted = False | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._fitted: | |||
return CallResult(None) | |||
        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns
if len(self._training_indices) > 0: | |||
self._clf.fit(self._training_inputs) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
sk_inputs = inputs | |||
if self.hyperparams['use_semantic_types']: | |||
sk_inputs = inputs.iloc[:, self._training_indices] | |||
output_columns = [] | |||
if len(self._training_indices) > 0: | |||
sk_output = self._clf.transform(sk_inputs) | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
outputs = self._wrap_predictions(inputs, sk_output) | |||
if len(outputs.columns) == len(self._input_column_names): | |||
outputs.columns = self._input_column_names | |||
output_columns = [outputs] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._training_indices, | |||
columns_list=output_columns) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_columns'], | |||
exclude_columns=hyperparams['exclude_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
            semantic_types_to_remove = set()
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=True) | |||
target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], | |||
outputs_metadata: metadata_base.DataMetadata, hyperparams): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in input_indices: | |||
column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set([]) | |||
add_semantic_types = set() | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
        # If outputs has more columns than input_indices, add the configured semantic type to all remaining columns.
if outputs_length > len(input_indices): | |||
for column_index in range(len(input_indices), outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = set() | |||
semantic_types.add(hyperparams["return_semantic_type"]) | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = list(semantic_types) | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKBinarizer.__doc__ = Binarizer.__doc__ |
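# A minimal usage sketch (not part of the generated wrapper); assumes a d3m
# environment and a sklearn-wrap git checkout, with illustrative data. With the
# default threshold of 0.0, values strictly above 0.0 map to 1 and the rest to 0.
if __name__ == '__main__':
    data = d3m_dataframe({'a': [-1.0, 0.0, 2.5]}, generate_metadata=True)
    primitive = SKBinarizer(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=data)
    primitive.fit()
    print(primitive.produce(inputs=data).value)  # expected column values: 0.0, 0.0, 1.0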
@@ -1,490 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.feature_extraction.text import CountVectorizer | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase | |||
from d3m.metadata.base import ALL_ELEMENTS | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
vocabulary_: Optional[Sequence[dict]] | |||
stop_words_: Optional[Sequence[set]] | |||
fixed_vocabulary_: Optional[Sequence[bool]] | |||
_stop_words_id: Optional[Sequence[int]] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
strip_accents = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'accents': hyperparams.Enumeration[str]( | |||
default='ascii', | |||
values=['ascii', 'unicode'], | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
        description='Remove accents during the preprocessing step. \'ascii\' is a fast method that only works on characters that have a direct ASCII mapping. \'unicode\' is a slightly slower method that works on any characters. None (default) does nothing.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
analyzer = hyperparams.Enumeration[str]( | |||
default='word', | |||
values=['word', 'char', 'char_wb'], | |||
description='Whether the feature should be made of word or character n-grams. Option \'char_wb\' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. If a callable is passed it is used to extract the sequence of features out of the raw, unprocessed input.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
ngram_range = hyperparams.SortedList( | |||
elements=hyperparams.Bounded[int](1, None, 1), | |||
default=(1, 1), | |||
min_size=2, | |||
max_size=2, | |||
description='The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
stop_words = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'string': hyperparams.Hyperparameter[str]( | |||
default='english', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'list': hyperparams.List( | |||
elements=hyperparams.Hyperparameter[str](''), | |||
default=[], | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='If \'english\', a built-in stop word list for English is used. If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens. Only applies if ``analyzer == \'word\'``. If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
lowercase = hyperparams.UniformBool( | |||
default=True, | |||
description='Convert all characters to lowercase before tokenizing.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
token_pattern = hyperparams.Hyperparameter[str]( | |||
        default='(?u)\\b\\w\\w+\\b',
        description='Regular expression denoting what constitutes a "token", only used if ``analyzer == \'word\'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored and always treated as a token separator).',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
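    # Note: with the default pattern only runs of two or more word characters
    # count as tokens, e.g. "a cat, 2 dogs" tokenizes to ['cat', 'dogs'].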
max_df = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'proportion': hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0.0, | |||
upper=1.0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'absolute': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='proportion', | |||
        description='When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If a float, the parameter represents a proportion of documents; if an integer, absolute counts. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_df = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'proportion': hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0.0, | |||
upper=1.0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'absolute': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
        description='When building the vocabulary, ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If a float, the parameter represents a proportion of documents; if an integer, absolute counts. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
        description='If not None, build a vocabulary that only considers the top max_features ordered by term frequency across the corpus. This parameter is ignored if vocabulary is not None.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
binary = hyperparams.UniformBool( | |||
default=False, | |||
description='If True, all non zero counts are set to 1. This is useful for discrete probabilistic models that model binary events rather than integer counts.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
class SKCountVectorizer(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn CountVectorizer | |||
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html>`_
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.MINIMUM_REDUNDANCY_FEATURE_SELECTION, ], | |||
"name": "sklearn.feature_extraction.text.CountVectorizer", | |||
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, | |||
"python_path": "d3m.primitives.data_preprocessing.count_vectorizer.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.CountVectorizer.html']}, | |||
"version": "2019.11.13", | |||
"id": "0609859b-8ed9-397f-ac7a-7c4f63863560", | |||
"hyperparams_to_tune": ['max_df', 'min_df'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = list() | |||
self._training_inputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._fitted = False | |||
def set_training_data(self, *, inputs: Inputs) -> None: | |||
self._inputs = inputs | |||
self._fitted = False | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._fitted: | |||
return CallResult(None) | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
if self._training_inputs is None: | |||
raise ValueError("Missing training data.") | |||
if len(self._training_indices) > 0: | |||
for column_index in range(len(self._training_inputs.columns)): | |||
clf = self._create_new_sklearn_estimator() | |||
clf.fit(self._training_inputs.iloc[:, column_index]) | |||
self._clf.append(clf) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
sk_inputs = inputs | |||
if self.hyperparams['use_semantic_types']: | |||
sk_inputs, training_indices = self._get_columns_to_fit(inputs, self.hyperparams) | |||
else: | |||
            training_indices = list(range(len(inputs.columns)))
        # Iterate over all estimators and call transform on each one.
        # The number of estimators must equal the number of input columns.
if len(self._clf) != len(sk_inputs.columns): | |||
raise RuntimeError("Input data does not have the same number of columns as training data") | |||
outputs = [] | |||
if len(self._training_indices) > 0: | |||
for column_index in range(len(sk_inputs.columns)): | |||
clf = self._clf[column_index] | |||
output = clf.transform(sk_inputs.iloc[:, column_index]) | |||
column_name = sk_inputs.columns[column_index] | |||
if sparse.issparse(output): | |||
output = output.toarray() | |||
output = self._wrap_predictions(inputs, output) | |||
# Updating column names. | |||
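                # e.g. a 'text' column with learned vocabulary ['cat', 'dog']
                # produces output columns 'text_cat' and 'text_dog'.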
                output.columns = ["{}_{}".format(column_name, x) for x in clf.get_feature_names()]
for i, name in enumerate(clf.get_feature_names()): | |||
output.metadata = output.metadata.update((ALL_ELEMENTS, i), {'name': name}) | |||
outputs.append(output) | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._training_indices, | |||
columns_list=outputs) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
vocabulary_=None, | |||
stop_words_=None, | |||
fixed_vocabulary_=None, | |||
_stop_words_id=None, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names | |||
) | |||
return Params( | |||
vocabulary_=list(map(lambda clf: getattr(clf, 'vocabulary_', None), self._clf)), | |||
stop_words_=list(map(lambda clf: getattr(clf, 'stop_words_', None), self._clf)), | |||
fixed_vocabulary_=list(map(lambda clf: getattr(clf, 'fixed_vocabulary_', None), self._clf)), | |||
_stop_words_id=list(map(lambda clf: getattr(clf, '_stop_words_id', None), self._clf)), | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
for param, val in params.items(): | |||
if val is not None and param not in ['target_names_', 'training_indices_']: | |||
self._clf = list(map(lambda x: self._create_new_sklearn_estimator(), val)) | |||
break | |||
        for index in range(len(self._clf)):
            for param, val in params.items():
                # Bookkeeping entries are not sklearn attributes; only restore the
                # per-column estimator attributes here.
                if param in ['target_names_', 'training_indices_']:
                    continue
                if val is not None:
                    setattr(self._clf[index], param, val[index])
                else:
                    setattr(self._clf[index], param, None)
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._fitted = False | |||
if params['vocabulary_'] is not None: | |||
self._fitted = True | |||
if params['stop_words_'] is not None: | |||
self._fitted = True | |||
if params['fixed_vocabulary_'] is not None: | |||
self._fitted = True | |||
if params['_stop_words_id'] is not None: | |||
self._fitted = True | |||
def _create_new_sklearn_estimator(self): | |||
clf = CountVectorizer( | |||
strip_accents=self.hyperparams['strip_accents'], | |||
analyzer=self.hyperparams['analyzer'], | |||
ngram_range=self.hyperparams['ngram_range'], | |||
stop_words=self.hyperparams['stop_words'], | |||
lowercase=self.hyperparams['lowercase'], | |||
token_pattern=self.hyperparams['token_pattern'], | |||
max_df=self.hyperparams['max_df'], | |||
min_df=self.hyperparams['min_df'], | |||
max_features=self.hyperparams['max_features'], | |||
binary=self.hyperparams['binary'], | |||
) | |||
return clf | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_columns'], | |||
exclude_columns=hyperparams['exclude_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (str,) | |||
accepted_semantic_types = set(["http://schema.org/Text",]) | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), [] | |||
target_names = [] | |||
target_semantic_type = [] | |||
target_column_indices = [] | |||
metadata = data.metadata | |||
target_column_indices.extend(metadata.get_columns_with_semantic_type('https://metadata.datadrivendiscovery.org/types/TrueTarget')) | |||
for column_index in target_column_indices: | |||
if column_index is metadata_base.ALL_ELEMENTS: | |||
continue | |||
column_index = typing.cast(metadata_base.SimpleSelectorSegment, column_index) | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
target_names.append(column_metadata.get('name', str(column_index))) | |||
target_semantic_type.append(column_metadata.get('semantic_types', [])) | |||
targets = data.iloc[:, target_column_indices] | |||
return targets, target_names, target_semantic_type | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
            semantic_types_to_remove = set()
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=True) | |||
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/Attribute') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKCountVectorizer.__doc__ = CountVectorizer.__doc__ |
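# A minimal usage sketch (not part of the generated wrapper); assumes a d3m
# environment and a sklearn-wrap git checkout. One CountVectorizer is fitted per
# input column, and produced columns are named '<input column>_<token>'.
if __name__ == '__main__':
    docs = d3m_dataframe({'text': ['the cat sat', 'the dog sat']}, generate_metadata=True)
    primitive = SKCountVectorizer(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=docs)
    primitive.fit()
    print(primitive.produce(inputs=docs).value)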
@@ -1,621 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.tree import DecisionTreeClassifier
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
classes_: Optional[Union[ndarray, List[ndarray]]] | |||
max_features_: Optional[int] | |||
n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]] | |||
n_features_: Optional[int] | |||
n_outputs_: Optional[int] | |||
tree_: Optional[object] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
criterion = hyperparams.Enumeration[str]( | |||
values=['gini', 'entropy'], | |||
default='gini', | |||
description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
splitter = hyperparams.Enumeration[str]( | |||
values=['best', 'random'], | |||
default='best', | |||
description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_depth = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
default=10, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_split = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
default=2, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_leaf = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=0.5, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_weight_fraction_leaf = hyperparams.Bounded[float]( | |||
default=0, | |||
lower=0, | |||
upper=0.5, | |||
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_leaf_nodes = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'specified_int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'calculated': hyperparams.Enumeration[str]( | |||
values=['auto', 'sqrt', 'log2'], | |||
default='auto', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_impurity_decrease = hyperparams.Bounded[float]( | |||
default=0.0, | |||
lower=0.0, | |||
upper=None, | |||
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
class_weight = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'str': hyperparams.Constant( | |||
default='balanced', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
presort = hyperparams.UniformBool( | |||
default=False, | |||
description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKDecisionTreeClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn DecisionTreeClassifier | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], | |||
"name": "sklearn.tree.tree.DecisionTreeClassifier", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.decision_tree.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html']}, | |||
"version": "2019.11.13", | |||
"id": "e20d003d-6a9f-35b0-b4b5-20e42b30282a", | |||
"hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = DecisionTreeClassifier( | |||
criterion=self.hyperparams['criterion'], | |||
splitter=self.hyperparams['splitter'], | |||
max_depth=self.hyperparams['max_depth'], | |||
min_samples_split=self.hyperparams['min_samples_split'], | |||
min_samples_leaf=self.hyperparams['min_samples_leaf'], | |||
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], | |||
max_leaf_nodes=self.hyperparams['max_leaf_nodes'], | |||
max_features=self.hyperparams['max_features'], | |||
min_impurity_decrease=self.hyperparams['min_impurity_decrease'], | |||
class_weight=self.hyperparams['class_weight'], | |||
presort=self.hyperparams['presort'], | |||
random_state=self.random_seed, | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
        self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
            # Some estimators (e.g. GaussianProcessRegressor) can predict without being fitted, so check the fitted flag explicitly.
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
classes_=None, | |||
max_features_=None, | |||
n_classes_=None, | |||
n_features_=None, | |||
n_outputs_=None, | |||
tree_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
classes_=getattr(self._clf, 'classes_', None), | |||
max_features_=getattr(self._clf, 'max_features_', None), | |||
n_classes_=getattr(self._clf, 'n_classes_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
tree_=getattr(self._clf, 'tree_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.max_features_ = params['max_features_'] | |||
self._clf.n_classes_ = params['n_classes_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.tree_ = params['tree_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['max_features_'] is not None: | |||
self._fitted = True | |||
if params['n_classes_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['tree_'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
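        # Returns, for each sample in "inputs", the log-probability the fitted
        # tree assigns to the corresponding class in "outputs"; this backs the
        # ProbabilisticCompositionalityMixin declared on the class.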
        inputs = inputs.iloc[:, self._training_indices]  # Keep only the columns used during training.
outputs = outputs.iloc[:, self._target_column_indices] | |||
if len(inputs.columns) and len(outputs.columns): | |||
if outputs.shape[1] != self._clf.n_outputs_: | |||
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") | |||
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
if self._clf.n_outputs_ == 1: | |||
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
for k in range(self._clf.n_outputs_): | |||
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
for k in range(self._clf.n_outputs_): | |||
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: | |||
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) | |||
output.columns = self._input_column_names | |||
for i in range(len(self._input_column_names)): | |||
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) | |||
return CallResult(output) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
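    # For example (hypothetical metadata): a float column tagged with both
    # .../types/Attribute and .../types/Float is accepted above, while a string
    # column, or one tagged only as .../types/TrueTarget, is rejected.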
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column)
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKDecisionTreeClassifier.__doc__ = DecisionTreeClassifier.__doc__ |
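# A minimal usage sketch (added for illustration; not part of the original
# file): driving the primitive by hand with default hyper-parameters, outside
# a D3M pipeline. The column names and data below are made up; in practice the
# D3M runtime performs these calls.
if __name__ == '__main__':
    train_inputs = d3m_dataframe(pandas.DataFrame({'a': [0.0, 1.0, 2.0, 3.0], 'b': [1.0, 0.0, 1.0, 0.0]}), generate_metadata=True)
    train_outputs = d3m_dataframe(pandas.DataFrame({'target': [0, 1, 0, 1]}), generate_metadata=True)
    primitive = SKDecisionTreeClassifier(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=train_inputs, outputs=train_outputs)
    primitive.fit()
    print(primitive.produce(inputs=train_inputs).value)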
@@ -1,565 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.tree import DecisionTreeRegressor
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
max_features_: Optional[int] | |||
n_features_: Optional[int] | |||
n_outputs_: Optional[int] | |||
tree_: Optional[object] | |||
classes_: Optional[Union[ndarray, List[ndarray]]] | |||
n_classes_: Optional[Union[numpy.int64, List[numpy.int64]]] | |||
class_weight: Optional[Union[str, dict, List[dict]]] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
criterion = hyperparams.Enumeration[str]( | |||
values=['mse', 'friedman_mse', 'mae'], | |||
default='mse', | |||
        description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, "friedman_mse", which uses mean squared error with Friedman\'s improvement score for potential splits, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
splitter = hyperparams.Enumeration[str]( | |||
values=['best', 'random'], | |||
default='best', | |||
description='The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_depth = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=5, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_split = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'float': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=1, | |||
default=1.0, | |||
description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=2, | |||
description='Minimum number.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='int', | |||
        description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
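    # For example (illustrative numbers): with 200 training samples, the float
    # variant min_samples_split=0.1 requires ceil(0.1 * 200) = 20 samples at a
    # node before it may be split.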
min_samples_leaf = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'percent': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=0.5, | |||
default=0.25, | |||
description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'absolute': hyperparams.Bounded[int]( | |||
lower=1, | |||
upper=None, | |||
default=1, | |||
description='Minimum number.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
        description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.',
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_weight_fraction_leaf = hyperparams.Bounded[float]( | |||
default=0, | |||
lower=0, | |||
upper=0.5, | |||
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_leaf_nodes = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=10, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'specified_int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'calculated': hyperparams.Enumeration[str]( | |||
values=['auto', 'sqrt', 'log2'], | |||
default='auto', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='calculated', | |||
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_impurity_decrease = hyperparams.Bounded[float]( | |||
default=0.0, | |||
lower=0.0, | |||
upper=None, | |||
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
presort = hyperparams.UniformBool( | |||
default=False, | |||
description='Whether to presort the data to speed up the finding of best splits in fitting. For the default settings of a decision tree on large datasets, setting this to true may slow down the training process. When using either a smaller dataset or a restricted depth, this may speed up the training.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKDecisionTreeRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn DecisionTreeRegressor | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], | |||
"name": "sklearn.tree.tree.DecisionTreeRegressor", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.decision_tree.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html']}, | |||
"version": "2019.11.13", | |||
"id": "6c420bd8-01d1-321f-9a35-afc4b758a5c6", | |||
"hyperparams_to_tune": ['max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = DecisionTreeRegressor( | |||
criterion=self.hyperparams['criterion'], | |||
splitter=self.hyperparams['splitter'], | |||
max_depth=self.hyperparams['max_depth'], | |||
min_samples_split=self.hyperparams['min_samples_split'], | |||
min_samples_leaf=self.hyperparams['min_samples_leaf'], | |||
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], | |||
max_leaf_nodes=self.hyperparams['max_leaf_nodes'], | |||
max_features=self.hyperparams['max_features'], | |||
min_impurity_decrease=self.hyperparams['min_impurity_decrease'], | |||
presort=self.hyperparams['presort'], | |||
random_state=self.random_seed, | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
        self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
            # Some estimators (e.g. GaussianProcessRegressor) can predict without being fitted, so check the fitted flag explicitly.
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
max_features_=None, | |||
n_features_=None, | |||
n_outputs_=None, | |||
tree_=None, | |||
classes_=None, | |||
n_classes_=None, | |||
class_weight=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
max_features_=getattr(self._clf, 'max_features_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
tree_=getattr(self._clf, 'tree_', None), | |||
classes_=getattr(self._clf, 'classes_', None), | |||
n_classes_=getattr(self._clf, 'n_classes_', None), | |||
class_weight=getattr(self._clf, 'class_weight', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.max_features_ = params['max_features_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.tree_ = params['tree_'] | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.n_classes_ = params['n_classes_'] | |||
self._clf.class_weight = params['class_weight'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['max_features_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['tree_'] is not None: | |||
self._fitted = True | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['n_classes_'] is not None: | |||
self._fitted = True | |||
if params['class_weight'] is not None: | |||
self._fitted = True | |||
def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: | |||
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) | |||
output.columns = self._input_column_names | |||
for i in range(len(self._input_column_names)): | |||
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) | |||
return CallResult(output) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column)
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKDecisionTreeRegressor.__doc__ = DecisionTreeRegressor.__doc__ |
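# A minimal sketch (added for illustration; not part of the original file) of
# moving fitted state between instances via get_params/set_params, as the D3M
# runtime does when a primitive is pickled or resumed. Data and column names
# below are illustrative only.
if __name__ == '__main__':
    X = d3m_dataframe(pandas.DataFrame({'x': [0.0, 1.0, 2.0, 3.0]}), generate_metadata=True)
    y = d3m_dataframe(pandas.DataFrame({'y': [0.0, 1.1, 1.9, 3.2]}), generate_metadata=True)
    fitted = SKDecisionTreeRegressor(hyperparams=Hyperparams.defaults())
    fitted.set_training_data(inputs=X, outputs=y)
    fitted.fit()
    restored = SKDecisionTreeRegressor(hyperparams=Hyperparams.defaults())
    restored.set_params(params=fitted.get_params())
    print(restored.produce(inputs=X).value)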
@@ -1,503 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.dummy import DummyClassifier | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
classes_: Optional[ndarray] | |||
n_classes_: Optional[Union[int,ndarray]] | |||
class_prior_: Optional[ndarray] | |||
n_outputs_: Optional[int] | |||
sparse_output_: Optional[bool] | |||
output_2d_: Optional[bool] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
strategy = hyperparams.Choice( | |||
choices={ | |||
'stratified': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'most_frequent': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'prior': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'uniform': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'constant': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({ | |||
'constant': hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'str': hyperparams.Hyperparameter[str]( | |||
default='one', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'int': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'ndarray': hyperparams.Hyperparameter[ndarray]( | |||
default=numpy.array([]), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='int', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
}) | |||
) | |||
}, | |||
default='stratified', | |||
description='Strategy to use to generate predictions. * "stratified": generates predictions by respecting the training set\'s class distribution. * "most_frequent": always predicts the most frequent label in the training set. * "prior": always predicts the class that maximizes the class prior (like "most_frequent") and ``predict_proba`` returns the class prior. * "uniform": generates predictions uniformly at random. * "constant": always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class .. versionadded:: 0.17 Dummy Classifier now supports prior fitting strategy using parameter *prior*.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
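    # The Choice hyper-parameter above resolves to a nested mapping at runtime;
    # illustratively, strategy may equal {'choice': 'stratified'} or, for the
    # constant strategy, {'choice': 'constant', 'constant': 1}. The constructor
    # below reads it via self.hyperparams['strategy']['choice'] and
    # self.hyperparams['strategy'].get('constant', ...).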
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKDummyClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn DummyClassifier | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ], | |||
"name": "sklearn.dummy.DummyClassifier", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.dummy.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html']}, | |||
"version": "2019.11.13", | |||
"id": "a1056ddf-2e89-3d8d-8308-2146170ae54d", | |||
"hyperparams_to_tune": ['strategy'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = DummyClassifier( | |||
strategy=self.hyperparams['strategy']['choice'], | |||
constant=self.hyperparams['strategy'].get('constant', 'int'), | |||
random_state=self.random_seed, | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
        self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
return CallResult(None) | |||
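    # Editorial note: single-column targets are flattened with numpy.ravel above | |||
    # because sklearn estimators expect y of shape (n_samples,); genuinely | |||
    # multi-column targets are passed through to fit as 2-D. | |||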
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
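    # Editorial note: combine_columns implements the "return_result" hyper-parameter: | |||
    # 'new' returns only the predictions (plus primary index columns when | |||
    # "add_index_columns" is set), 'append' appends them to the input columns, and | |||
    # 'replace' overwrites the original target columns in place. | |||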
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
classes_=None, | |||
n_classes_=None, | |||
class_prior_=None, | |||
n_outputs_=None, | |||
sparse_output_=None, | |||
output_2d_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
classes_=getattr(self._clf, 'classes_', None), | |||
n_classes_=getattr(self._clf, 'n_classes_', None), | |||
class_prior_=getattr(self._clf, 'class_prior_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
sparse_output_=getattr(self._clf, 'sparse_output_', None), | |||
output_2d_=getattr(self._clf, 'output_2d_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.n_classes_ = params['n_classes_'] | |||
self._clf.class_prior_ = params['class_prior_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.sparse_output_ = params['sparse_output_'] | |||
self._clf.output_2d_ = params['output_2d_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['n_classes_'] is not None: | |||
self._fitted = True | |||
if params['class_prior_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['sparse_output_'] is not None: | |||
self._fitted = True | |||
if params['output_2d_'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
        inputs = inputs.iloc[:, self._training_indices]  # keep only the columns used during fit | |||
outputs = outputs.iloc[:, self._target_column_indices] | |||
if len(inputs.columns) and len(outputs.columns): | |||
if outputs.shape[1] != self._clf.n_outputs_: | |||
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") | |||
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
if self._clf.n_outputs_ == 1: | |||
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
for k in range(self._clf.n_outputs_): | |||
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
for k in range(self._clf.n_outputs_): | |||
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
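    # Editorial note: predict_log_proba returns, per target, an array of shape | |||
    # (n_samples, n_classes) whose columns follow self._clf.classes_. The | |||
    # classes_map above converts the (possibly string) labels in "outputs" into | |||
    # those column indices, e.g. with classes_ == ['no', 'yes'] a row labelled | |||
    # 'yes' picks column 1 of log_proba for that sample. | |||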
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
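    # Editorial note: a column is selected for fitting only when both checks pass: | |||
    # its structural type is numeric and its metadata carries the Attribute | |||
    # semantic type. With use_semantic_types=False, _get_columns_to_fit bypasses | |||
    # this filter and uses every column as-is. | |||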
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use( | |||
            metadata, | |||
            use_columns=hyperparams['use_outputs_columns'], | |||
            exclude_columns=hyperparams['exclude_outputs_columns'], | |||
            can_use_column=can_produce_column) | |||
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
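    # Editorial note: the loop above rewrites target metadata for prediction | |||
    # columns: TrueTarget/SuggestedTarget are stripped and PredictedTarget (plus | |||
    # the configured "return_semantic_type") is attached in their place. | |||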
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKDummyClassifier.__doc__ = DummyClassifier.__doc__ |
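A minimal usage sketch for the wrapper above (an editorial addition, not part of the generated file). It assumes default hyper-parameters, so use_semantic_types is false and every column of the hypothetical toy DataFrames below is used directly: | |||
from d3m.container import DataFrame as d3m_dataframe | |||
 | |||
hp = Hyperparams.defaults() | |||
classifier = SKDummyClassifier(hyperparams=hp) | |||
 | |||
# Hypothetical toy data; any d3m DataFrame with numeric inputs works here. | |||
X = d3m_dataframe({'a': [0.0, 1.0, 2.0, 3.0]}, generate_metadata=True) | |||
y = d3m_dataframe({'target': ['yes', 'no', 'yes', 'yes']}, generate_metadata=True) | |||
 | |||
classifier.set_training_data(inputs=X, outputs=y) | |||
classifier.fit() | |||
predictions = classifier.produce(inputs=X).value  # DataFrame of PredictedTarget columns | |||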
@@ -1,442 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.dummy import DummyRegressor | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
constant_: Optional[Union[float, ndarray]] | |||
n_outputs_: Optional[int] | |||
output_2d_: Optional[bool] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
strategy = hyperparams.Choice( | |||
choices={ | |||
'mean': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'median': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'quantile': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({ | |||
'quantile': hyperparams.Uniform( | |||
default=0.5, | |||
lower=0, | |||
upper=1.0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
}) | |||
), | |||
'constant': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({ | |||
'constant': hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'float': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=None, | |||
default=1.0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'int': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'ndarray': hyperparams.Hyperparameter[ndarray]( | |||
default=numpy.array([]), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='float', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
}) | |||
) | |||
}, | |||
default='mean', | |||
description='Strategy to use to generate predictions. * "mean": always predicts the mean of the training set * "median": always predicts the median of the training set * "quantile": always predicts a specified quantile of the training set, provided with the quantile parameter. * "constant": always predicts a constant value that is provided by the user.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKDummyRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn DummyRegressor | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.RULE_BASED_MACHINE_LEARNING, ], | |||
"name": "sklearn.dummy.DummyRegressor", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.dummy.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html']}, | |||
"version": "2019.11.13", | |||
"id": "05aa5b6a-3b27-34dc-9ba7-8511fb13f253", | |||
"hyperparams_to_tune": ['strategy'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = DummyRegressor( | |||
strategy=self.hyperparams['strategy']['choice'], | |||
            quantile=self.hyperparams['strategy'].get('quantile', None),  # only set when strategy == 'quantile' | |||
            constant=self.hyperparams['strategy'].get('constant', None),  # only set when strategy == 'constant' | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
        self._target_columns_metadata: Optional[List[OrderedDict]] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
constant_=None, | |||
n_outputs_=None, | |||
output_2d_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
constant_=getattr(self._clf, 'constant_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
output_2d_=getattr(self._clf, 'output_2d_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.constant_ = params['constant_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.output_2d_ = params['output_2d_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['constant_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['output_2d_'] is not None: | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use( | |||
            metadata, | |||
            use_columns=hyperparams['use_outputs_columns'], | |||
            exclude_columns=hyperparams['exclude_outputs_columns'], | |||
            can_use_column=can_produce_column) | |||
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKDummyRegressor.__doc__ = DummyRegressor.__doc__ |
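An editorial sketch of how the nested "strategy" Choice above is configured; the quantile_cls name is ours, and the calls assume d3m's hyperparams API (configuration, choices, defaults, replace) behaves as documented. Choice values act like small dict-like Hyperparams instances, which is why __init__ reads hyperparams['strategy']['choice'] and .get('quantile'): | |||
# Pick the 'quantile' branch of the Choice and override its tuning parameter. | |||
quantile_cls = Hyperparams.configuration['strategy'].choices['quantile'] | |||
hp = Hyperparams.defaults().replace({ | |||
    'strategy': quantile_cls.defaults().replace({'quantile': 0.9}), | |||
}) | |||
regressor = SKDummyRegressor(hyperparams=hp)  # always predicts the 0.9 quantile | |||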
@@ -1,466 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.linear_model import ElasticNet | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
coef_: Optional[ndarray] | |||
intercept_: Optional[float] | |||
n_iter_: Optional[int] | |||
dual_gap_: Optional[float] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
alpha = hyperparams.Bounded[float]( | |||
default=1.0, | |||
lower=0, | |||
upper=None, | |||
        description='Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
l1_ratio = hyperparams.Uniform( | |||
default=0.5, | |||
lower=0, | |||
upper=1, | |||
        description='The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
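    # Editorial note: together, alpha and l1_ratio parameterize sklearn's objective | |||
    #   1 / (2 * n_samples) * ||y - Xw||_2^2 | |||
    #     + alpha * l1_ratio * ||w||_1 | |||
    #     + alpha * (1 - l1_ratio) / 2 * ||w||_2^2 | |||
    # so alpha scales the total penalty and l1_ratio trades L1 against L2. | |||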
fit_intercept = hyperparams.UniformBool( | |||
default=True, | |||
description='Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
normalize = hyperparams.UniformBool( | |||
default=False, | |||
description='This parameter is ignored when ``fit_intercept`` is set to False. If True, the regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm. If you wish to standardize, please use :class:`sklearn.preprocessing.StandardScaler` before calling ``fit`` on an estimator with ``normalize=False``.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
precompute = hyperparams.UniformBool( | |||
default=False, | |||
description='Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``True`` to preserve sparsity.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] | |||
) | |||
max_iter = hyperparams.Bounded[int]( | |||
default=1000, | |||
lower=0, | |||
upper=None, | |||
description='The maximum number of iterations', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
tol = hyperparams.Bounded[float]( | |||
default=0.0001, | |||
lower=0, | |||
upper=None, | |||
description='The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
positive = hyperparams.UniformBool( | |||
default=False, | |||
description='When set to ``True``, forces the coefficients to be positive.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
selection = hyperparams.Enumeration[str]( | |||
default='cyclic', | |||
values=['cyclic', 'random'], | |||
description='If set to \'random\', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to \'random\') often leads to significantly faster convergence especially when tol is higher than 1e-4.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
warm_start = hyperparams.UniformBool( | |||
default=False, | |||
description='When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary <warm_start>`.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKElasticNet(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn ElasticNet | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.ELASTIC_NET_REGULARIZATION, ], | |||
"name": "sklearn.linear_model.coordinate_descent.ElasticNet", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.elastic_net.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html']}, | |||
"version": "2019.11.13", | |||
"id": "a85d4ffb-49ab-35b1-a70c-6df209312aae", | |||
"hyperparams_to_tune": ['alpha', 'max_iter', 'l1_ratio'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = ElasticNet( | |||
alpha=self.hyperparams['alpha'], | |||
l1_ratio=self.hyperparams['l1_ratio'], | |||
fit_intercept=self.hyperparams['fit_intercept'], | |||
normalize=self.hyperparams['normalize'], | |||
precompute=self.hyperparams['precompute'], | |||
max_iter=self.hyperparams['max_iter'], | |||
tol=self.hyperparams['tol'], | |||
positive=self.hyperparams['positive'], | |||
selection=self.hyperparams['selection'], | |||
warm_start=self.hyperparams['warm_start'], | |||
random_state=self.random_seed, | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
        self._target_columns_metadata: Optional[List[OrderedDict]] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
            self.logger.warning("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
coef_=None, | |||
intercept_=None, | |||
n_iter_=None, | |||
dual_gap_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
coef_=getattr(self._clf, 'coef_', None), | |||
intercept_=getattr(self._clf, 'intercept_', None), | |||
n_iter_=getattr(self._clf, 'n_iter_', None), | |||
dual_gap_=getattr(self._clf, 'dual_gap_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.coef_ = params['coef_'] | |||
self._clf.intercept_ = params['intercept_'] | |||
self._clf.n_iter_ = params['n_iter_'] | |||
self._clf.dual_gap_ = params['dual_gap_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['coef_'] is not None: | |||
self._fitted = True | |||
if params['intercept_'] is not None: | |||
self._fitted = True | |||
if params['n_iter_'] is not None: | |||
self._fitted = True | |||
if params['dual_gap_'] is not None: | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use( | |||
            metadata, | |||
            use_columns=hyperparams['use_outputs_columns'], | |||
            exclude_columns=hyperparams['exclude_outputs_columns'], | |||
            can_use_column=can_produce_column) | |||
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKElasticNet.__doc__ = ElasticNet.__doc__ |
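An editorial sketch of the Params round-trip all of these wrappers share: a fitted instance exports its learned sklearn attributes through get_params(), and a fresh instance restores them through set_params() without re-fitting. X and y are hypothetical toy DataFrames: | |||
from d3m.container import DataFrame as d3m_dataframe | |||
 | |||
X = d3m_dataframe({'a': [0.0, 1.0, 2.0, 3.0]}, generate_metadata=True) | |||
y = d3m_dataframe({'t': [1.0, 2.0, 3.0, 4.0]}, generate_metadata=True) | |||
 | |||
fitted = SKElasticNet(hyperparams=Hyperparams.defaults()) | |||
fitted.set_training_data(inputs=X, outputs=y) | |||
fitted.fit() | |||
params = fitted.get_params()  # coef_, intercept_, n_iter_, dual_gap_, plus column bookkeeping | |||
 | |||
restored = SKElasticNet(hyperparams=Hyperparams.defaults()) | |||
restored.set_params(params=params)  # also marks the primitive as fitted | |||
predictions = restored.produce(inputs=X).value | |||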
@@ -1,675 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.ensemble import ExtraTreesClassifier | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
estimators_: Optional[Sequence[sklearn.base.BaseEstimator]] | |||
classes_: Optional[Union[ndarray, List[ndarray]]] | |||
n_classes_: Optional[Union[int, List[int]]] | |||
n_features_: Optional[int] | |||
n_outputs_: Optional[int] | |||
oob_score_: Optional[float] | |||
oob_decision_function_: Optional[ndarray] | |||
base_estimator_: Optional[object] | |||
estimator_params: Optional[tuple] | |||
base_estimator: Optional[object] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
n_estimators = hyperparams.Bounded[int]( | |||
default=10, | |||
lower=1, | |||
upper=None, | |||
description='The number of trees in the forest.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
criterion = hyperparams.Enumeration[str]( | |||
values=['gini', 'entropy'], | |||
default='gini', | |||
description='The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "entropy" for the information gain.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_depth = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=10, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_split = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
default=2, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_leaf = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'absolute': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=0.5, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_weight_fraction_leaf = hyperparams.Bounded[float]( | |||
default=0, | |||
lower=0, | |||
upper=0.5, | |||
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_leaf_nodes = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
default=10, | |||
lower=0, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'specified_int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'calculated': hyperparams.Enumeration[str]( | |||
values=['auto', 'sqrt', 'log2'], | |||
default='auto', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='calculated', | |||
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=sqrt(n_features)`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_impurity_decrease = hyperparams.Bounded[float]( | |||
default=0.0, | |||
lower=0.0, | |||
upper=None, | |||
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
bootstrap = hyperparams.Enumeration[str]( | |||
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], | |||
default='bootstrap', | |||
description='Whether bootstrap samples are used when building trees.' | |||
' And whether to use out-of-bag samples to estimate the generalization accuracy.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
n_jobs = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'limit': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'all_cores': hyperparams.Constant( | |||
default=-1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='limit', | |||
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] | |||
) | |||
warm_start = hyperparams.UniformBool( | |||
default=False, | |||
description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
class_weight = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'str': hyperparams.Enumeration[str]( | |||
default='balanced', | |||
values=['balanced', 'balanced_subsample'], | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` The "balanced_subsample" mode is the same as "balanced" except that weights are computed based on the bootstrap sample for every tree grown. For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
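# A note on the ``Union`` hyper-params above (an illustrative aside, not part of
# the original file): a ``Union`` resolves to whichever branch of its
# configuration matches the supplied value, so both of the following are valid
# ways to set ``max_depth``:
#
#     hp = Hyperparams.defaults().replace({'max_depth': 5})     # the 'int' branch
#     hp = Hyperparams.defaults().replace({'max_depth': None})  # the 'none' branch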
class SKExtraTreesClassifier(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn ExtraTreesClassifier | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], | |||
"name": "sklearn.ensemble.forest.ExtraTreesClassifier", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.extra_trees.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html']}, | |||
"version": "2019.11.13", | |||
"id": "c8a28f02-ef4a-35a8-87f1-cf79980f5c3e", | |||
"hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_verbose: int = 0) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = ExtraTreesClassifier( | |||
n_estimators=self.hyperparams['n_estimators'], | |||
criterion=self.hyperparams['criterion'], | |||
max_depth=self.hyperparams['max_depth'], | |||
min_samples_split=self.hyperparams['min_samples_split'], | |||
min_samples_leaf=self.hyperparams['min_samples_leaf'], | |||
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], | |||
max_leaf_nodes=self.hyperparams['max_leaf_nodes'], | |||
max_features=self.hyperparams['max_features'], | |||
min_impurity_decrease=self.hyperparams['min_impurity_decrease'], | |||
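# The single 'bootstrap' enumeration is mapped onto sklearn's two boolean
# arguments below: bagging is enabled for both 'bootstrap' and
# 'bootstrap_with_oob_score', while oob_score is enabled only for the latter.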
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], | |||
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], | |||
n_jobs=self.hyperparams['n_jobs'], | |||
warm_start=self.hyperparams['warm_start'], | |||
class_weight=self.hyperparams['class_weight'], | |||
random_state=self.random_seed, | |||
verbose=_verbose | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
estimators_=None, | |||
classes_=None, | |||
n_classes_=None, | |||
n_features_=None, | |||
n_outputs_=None, | |||
oob_score_=None, | |||
oob_decision_function_=None, | |||
base_estimator_=None, | |||
estimator_params=None, | |||
base_estimator=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
estimators_=getattr(self._clf, 'estimators_', None), | |||
classes_=getattr(self._clf, 'classes_', None), | |||
n_classes_=getattr(self._clf, 'n_classes_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
oob_score_=getattr(self._clf, 'oob_score_', None), | |||
oob_decision_function_=getattr(self._clf, 'oob_decision_function_', None), | |||
base_estimator_=getattr(self._clf, 'base_estimator_', None), | |||
estimator_params=getattr(self._clf, 'estimator_params', None), | |||
base_estimator=getattr(self._clf, 'base_estimator', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.estimators_ = params['estimators_'] | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.n_classes_ = params['n_classes_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.oob_score_ = params['oob_score_'] | |||
self._clf.oob_decision_function_ = params['oob_decision_function_'] | |||
self._clf.base_estimator_ = params['base_estimator_'] | |||
self._clf.estimator_params = params['estimator_params'] | |||
self._clf.base_estimator = params['base_estimator'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['estimators_'] is not None: | |||
self._fitted = True | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['n_classes_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['oob_score_'] is not None: | |||
self._fitted = True | |||
if params['oob_decision_function_'] is not None: | |||
self._fitted = True | |||
if params['base_estimator_'] is not None: | |||
self._fitted = True | |||
if params['estimator_params'] is not None: | |||
self._fitted = True | |||
if params['base_estimator'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
inputs = inputs.iloc[:, self._training_indices]  # Keep only the columns used during training.
outputs = outputs.iloc[:, self._target_column_indices] | |||
if len(inputs.columns) and len(outputs.columns): | |||
if outputs.shape[1] != self._clf.n_outputs_: | |||
raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.") | |||
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
if self._clf.n_outputs_ == 1: | |||
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
for k in range(self._clf.n_outputs_): | |||
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
for k in range(self._clf.n_outputs_): | |||
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: | |||
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) | |||
output.columns = self._input_column_names | |||
for i in range(len(self._input_column_names)): | |||
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) | |||
return CallResult(output) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKExtraTreesClassifier.__doc__ = ExtraTreesClassifier.__doc__
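# A minimal end-to-end sketch (illustrative only; the tiny dataset and the
# ``__main__`` guard are assumptions, not part of the original file). With the
# default hyper-params, ``use_semantic_types`` is False, so every column of the
# plain d3m DataFrames below is used:
if __name__ == '__main__':
    X = d3m_dataframe({'a': [0.0, 1.0, 0.0, 1.0], 'b': [1.0, 0.0, 1.0, 0.0]}, generate_metadata=True)
    y = d3m_dataframe({'target': ['no', 'yes', 'no', 'yes']}, generate_metadata=True)
    primitive = SKExtraTreesClassifier(hyperparams=Hyperparams.defaults())
    primitive.set_training_data(inputs=X, outputs=y)
    primitive.fit()
    print(primitive.produce(inputs=X).value)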
@@ -1,607 +0,0 @@
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.ensemble.forest import ExtraTreesRegressor | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
estimators_: Optional[List[sklearn.tree.ExtraTreeRegressor]] | |||
n_features_: Optional[int] | |||
n_outputs_: Optional[int] | |||
oob_score_: Optional[float] | |||
oob_prediction_: Optional[ndarray] | |||
base_estimator_: Optional[object] | |||
estimator_params: Optional[tuple] | |||
class_weight: Optional[Union[str, dict, List[dict]]] | |||
base_estimator: Optional[object] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
n_estimators = hyperparams.Bounded[int]( | |||
default=10, | |||
lower=1, | |||
upper=None, | |||
description='The number of trees in the forest.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
criterion = hyperparams.Enumeration[str]( | |||
values=['mse', 'mae'], | |||
default='mse', | |||
description='The function to measure the quality of a split. Supported criteria are "mse" for the mean squared error, which is equal to variance reduction as feature selection criterion, and "mae" for the mean absolute error. .. versionadded:: 0.18 Mean Absolute Error (MAE) criterion.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_depth = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=5, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_split = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'float': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=1, | |||
default=1.0, | |||
description='It\'s a percentage and `ceil(min_samples_split * n_samples)` is the minimum number of samples for each split.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=2, | |||
description='Minimum number.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='int', | |||
description='The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a percentage and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_samples_leaf = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'percent': hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=0.5, | |||
default=0.25, | |||
description='It\'s a percentage and `ceil(min_samples_leaf * n_samples)` is the minimum number of samples for each node.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'absolute': hyperparams.Bounded[int]( | |||
lower=1, | |||
upper=None, | |||
default=1, | |||
description='Minimum number.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='absolute', | |||
description='The minimum number of samples required to be at a leaf node: - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a percentage and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for percentages.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_weight_fraction_leaf = hyperparams.Bounded[float]( | |||
default=0, | |||
lower=0, | |||
upper=0.5, | |||
description='The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_leaf_nodes = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=10, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
max_features = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'specified_int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'calculated': hyperparams.Enumeration[str]( | |||
values=['auto', 'sqrt', 'log2'], | |||
default='auto', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'percent': hyperparams.Bounded[float]( | |||
default=0.25, | |||
lower=0, | |||
upper=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='calculated', | |||
description='The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a percentage and `int(max_features * n_features)` features are considered at each split. - If "auto", then `max_features=n_features`. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
min_impurity_decrease = hyperparams.Bounded[float]( | |||
default=0.0, | |||
lower=0.0, | |||
upper=None, | |||
description='A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19 ', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
bootstrap = hyperparams.Enumeration[str]( | |||
values=['bootstrap', 'bootstrap_with_oob_score', 'disabled'], | |||
default='bootstrap', | |||
description='Whether bootstrap samples are used when building trees.' | |||
' And whether to use out-of-bag samples to estimate the generalization accuracy.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
warm_start = hyperparams.UniformBool( | |||
default=False, | |||
description='When set to ``True``, reuse the solution of the previous call to fit and add more estimators to the ensemble, otherwise, just fit a whole new forest.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
n_jobs = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'limit': hyperparams.Bounded[int]( | |||
default=1, | |||
lower=1, | |||
upper=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'all_cores': hyperparams.Constant( | |||
default=-1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='limit', | |||
description='The number of jobs to run in parallel for both `fit` and `predict`. If -1, then the number of jobs is set to the number of cores.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ResourcesUseParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKExtraTreesRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn ExtraTreesRegressor | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DECISION_TREE, ], | |||
"name": "sklearn.ensemble.forest.ExtraTreesRegressor", | |||
"primitive_family": metadata_base.PrimitiveFamily.REGRESSION, | |||
"python_path": "d3m.primitives.regression.extra_trees.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesRegressor.html']}, | |||
"version": "2019.11.13", | |||
"id": "35321059-2a1a-31fd-9509-5494efc751c7", | |||
"hyperparams_to_tune": ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf', 'max_features'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_verbose: int = 0) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = ExtraTreesRegressor( | |||
n_estimators=self.hyperparams['n_estimators'], | |||
criterion=self.hyperparams['criterion'], | |||
max_depth=self.hyperparams['max_depth'], | |||
min_samples_split=self.hyperparams['min_samples_split'], | |||
min_samples_leaf=self.hyperparams['min_samples_leaf'], | |||
min_weight_fraction_leaf=self.hyperparams['min_weight_fraction_leaf'], | |||
max_leaf_nodes=self.hyperparams['max_leaf_nodes'], | |||
max_features=self.hyperparams['max_features'], | |||
min_impurity_decrease=self.hyperparams['min_impurity_decrease'], | |||
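# As in the classifier wrapper above, the 'bootstrap' enumeration expands into
# sklearn's two booleans: bootstrapping is on for both 'bootstrap' and
# 'bootstrap_with_oob_score', while out-of-bag scoring is on only for the latter.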
bootstrap=self.hyperparams['bootstrap'] in ['bootstrap', 'bootstrap_with_oob_score'], | |||
oob_score=self.hyperparams['bootstrap'] in ['bootstrap_with_oob_score'], | |||
warm_start=self.hyperparams['warm_start'], | |||
n_jobs=self.hyperparams['n_jobs'], | |||
random_state=self.random_seed, | |||
verbose=_verbose | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: Optional[List[OrderedDict]] = None
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected")
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warning("No input columns were selected")
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
estimators_=None, | |||
n_features_=None, | |||
n_outputs_=None, | |||
oob_score_=None, | |||
oob_prediction_=None, | |||
base_estimator_=None, | |||
estimator_params=None, | |||
class_weight=None, | |||
base_estimator=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
estimators_=getattr(self._clf, 'estimators_', None), | |||
n_features_=getattr(self._clf, 'n_features_', None), | |||
n_outputs_=getattr(self._clf, 'n_outputs_', None), | |||
oob_score_=getattr(self._clf, 'oob_score_', None), | |||
oob_prediction_=getattr(self._clf, 'oob_prediction_', None), | |||
base_estimator_=getattr(self._clf, 'base_estimator_', None), | |||
estimator_params=getattr(self._clf, 'estimator_params', None), | |||
class_weight=getattr(self._clf, 'class_weight', None), | |||
base_estimator=getattr(self._clf, 'base_estimator', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.estimators_ = params['estimators_'] | |||
self._clf.n_features_ = params['n_features_'] | |||
self._clf.n_outputs_ = params['n_outputs_'] | |||
self._clf.oob_score_ = params['oob_score_'] | |||
self._clf.oob_prediction_ = params['oob_prediction_'] | |||
self._clf.base_estimator_ = params['base_estimator_'] | |||
self._clf.estimator_params = params['estimator_params'] | |||
self._clf.class_weight = params['class_weight'] | |||
self._clf.base_estimator = params['base_estimator'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['estimators_'] is not None: | |||
self._fitted = True | |||
if params['n_features_'] is not None: | |||
self._fitted = True | |||
if params['n_outputs_'] is not None: | |||
self._fitted = True | |||
if params['oob_score_'] is not None: | |||
self._fitted = True | |||
if params['oob_prediction_'] is not None: | |||
self._fitted = True | |||
if params['base_estimator_'] is not None: | |||
self._fitted = True | |||
if params['estimator_params'] is not None: | |||
self._fitted = True | |||
if params['class_weight'] is not None: | |||
self._fitted = True | |||
if params['base_estimator'] is not None: | |||
self._fitted = True | |||
def produce_feature_importances(self, *, timeout: float = None, iterations: int = None) -> CallResult[d3m_dataframe]: | |||
output = d3m_dataframe(self._clf.feature_importances_.reshape((1, len(self._input_column_names)))) | |||
output.columns = self._input_column_names | |||
for i in range(len(self._input_column_names)): | |||
output.metadata = output.metadata.update_column(i, {"name": self._input_column_names[i]}) | |||
return CallResult(output) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
metadata,
use_columns=hyperparams['use_outputs_columns'],
exclude_columns=hyperparams['exclude_outputs_columns'],
can_use_column=can_produce_column)
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKExtraTreesRegressor.__doc__ = ExtraTreesRegressor.__doc__ |
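# A minimal sketch (not from the original file) of the column-selection rule used by
# _get_targets above: a column qualifies only when every accepted semantic type is
# present in its metadata, which the code tests via set difference. Plain sets stand
# in for the D3M metadata objects here.
accepted = {"https://metadata.datadrivendiscovery.org/types/TrueTarget"}

def qualifies(column_semantic_types):
    # An empty difference means every accepted type is present on the column.
    return len(accepted - set(column_semantic_types)) == 0

assert qualifies(["https://metadata.datadrivendiscovery.org/types/TrueTarget",
                  "https://metadata.datadrivendiscovery.org/types/Target"])
assert not qualifies(["https://metadata.datadrivendiscovery.org/types/Attribute"])
assert not qualifies([])  # no semantic types at all -> rejected (warned about above)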
@@ -1,439 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.decomposition.fastica_ import FastICA | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
n_iter_: Optional[int] | |||
mixing_: Optional[ndarray] | |||
components_: Optional[ndarray] | |||
mean_: Optional[ndarray] | |||
whitening_: Optional[ndarray] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
n_components = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'int': hyperparams.Bounded[int]( | |||
lower=0, | |||
upper=None, | |||
default=0, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
description='All components are used.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Number of components to extract. If None no dimension reduction is performed.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
algorithm = hyperparams.Enumeration[str]( | |||
default='parallel', | |||
values=['parallel', 'deflation'], | |||
description='Apply a parallel or deflational FASTICA algorithm.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
whiten = hyperparams.UniformBool( | |||
default=True, | |||
description='If True perform an initial whitening of the data. If False, the data is assumed to have already been preprocessed: it should be centered, normed and white. Otherwise you will get incorrect results. In this case the parameter n_components will be ignored.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
fun = hyperparams.Choice( | |||
choices={ | |||
'logcosh': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({ | |||
'alpha': hyperparams.Hyperparameter[float]( | |||
default=1, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
}) | |||
), | |||
'exp': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
), | |||
'cube': hyperparams.Hyperparams.define( | |||
configuration=OrderedDict({}) | |||
) | |||
}, | |||
default='logcosh', | |||
description='The functional form of the G function used in the approximation to neg-entropy. Could be either \'logcosh\', \'exp\', or \'cube\'. You can also provide your own function. It should return a tuple containing the value of the function, and of its derivative, in the point. Example: def my_g(x): return x ** 3, 3 * x ** 2', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
    max_iter = hyperparams.Bounded[int](
        default=200,
        lower=0,
        upper=None,
        description='Maximum number of iterations to perform.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    tol = hyperparams.Bounded[float](
        default=0.0001,
        lower=0,
        upper=None,
        description='A positive scalar giving the tolerance at which the un-mixing matrix is considered to have converged.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
w_init = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'ndarray': hyperparams.Hyperparameter[ndarray]( | |||
default=numpy.array([]), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'none': hyperparams.Constant( | |||
default=None, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='none', | |||
description='Initial un-mixing array of dimension (n.comp,n.comp). If None (default) then an array of normal r.v.\'s is used.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
use_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], | |||
default='https://metadata.datadrivendiscovery.org/types/Attribute', | |||
description='Decides what semantic type to attach to generated attributes', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKFastICA(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn FastICA | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.PRINCIPAL_COMPONENT_ANALYSIS, ], | |||
"name": "sklearn.decomposition.fastica_.FastICA", | |||
"primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION, | |||
"python_path": "d3m.primitives.data_transformation.fast_ica.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html']}, | |||
"version": "2019.11.13", | |||
"id": "03633ffa-425e-37d4-9f1c-bbb552f1e995", | |||
"hyperparams_to_tune": ['n_components', 'algorithm'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = FastICA( | |||
n_components=self.hyperparams['n_components'], | |||
algorithm=self.hyperparams['algorithm'], | |||
whiten=self.hyperparams['whiten'], | |||
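            # 'fun' is a Choice hyper-parameter: its value behaves like a dict holding the
            # selected name under 'choice' plus that choice's own parameters (e.g. 'alpha'
            # for logcosh), so it also serves as fun_args; sklearn only reads known keys.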
fun=self.hyperparams['fun']['choice'], | |||
fun_args=self.hyperparams['fun'], | |||
max_iter=self.hyperparams['max_iter'], | |||
tol=self.hyperparams['tol'], | |||
w_init=self.hyperparams['w_init'], | |||
random_state=self.random_seed, | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
def set_training_data(self, *, inputs: Inputs) -> None: | |||
self._inputs = inputs | |||
self._fitted = False | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._fitted: | |||
return CallResult(None) | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns
if len(self._training_indices) > 0: | |||
self._clf.fit(self._training_inputs) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
sk_inputs = inputs | |||
if self.hyperparams['use_semantic_types']: | |||
sk_inputs = inputs.iloc[:, self._training_indices] | |||
output_columns = [] | |||
if len(self._training_indices) > 0: | |||
sk_output = self._clf.transform(sk_inputs) | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
outputs = self._wrap_predictions(inputs, sk_output) | |||
if len(outputs.columns) == len(self._input_column_names): | |||
outputs.columns = self._input_column_names | |||
output_columns = [outputs] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._training_indices, | |||
columns_list=output_columns) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
n_iter_=None, | |||
mixing_=None, | |||
components_=None, | |||
mean_=None, | |||
whitening_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
n_iter_=getattr(self._clf, 'n_iter_', None), | |||
mixing_=getattr(self._clf, 'mixing_', None), | |||
components_=getattr(self._clf, 'components_', None), | |||
mean_=getattr(self._clf, 'mean_', None), | |||
whitening_=getattr(self._clf, 'whitening_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.n_iter_ = params['n_iter_'] | |||
self._clf.mixing_ = params['mixing_'] | |||
self._clf.components_ = params['components_'] | |||
self._clf.mean_ = params['mean_'] | |||
self._clf.whitening_ = params['whitening_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
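        # Restoring any fitted attribute below marks the primitive as fitted, so
        # produce() can run on an instance reconstructed purely from Params.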
if params['n_iter_'] is not None: | |||
self._fitted = True | |||
if params['mixing_'] is not None: | |||
self._fitted = True | |||
if params['components_'] is not None: | |||
self._fitted = True | |||
if params['mean_'] is not None: | |||
self._fitted = True | |||
if params['whitening_'] is not None: | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_columns'], | |||
exclude_columns=hyperparams['exclude_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
            semantic_types_to_remove = set()
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
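        # Metadata is generated from the produced array and then enriched with names
        # and semantic types copied from the input columns that were used for training.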
outputs = d3m_dataframe(predictions, generate_metadata=True) | |||
target_columns_metadata = self._copy_inputs_metadata(inputs.metadata, self._training_indices, outputs.metadata, self.hyperparams) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _copy_inputs_metadata(cls, inputs_metadata: metadata_base.DataMetadata, input_indices: List[int], | |||
outputs_metadata: metadata_base.DataMetadata, hyperparams): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in input_indices: | |||
column_name = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata = OrderedDict(inputs_metadata.query_column(column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set([]) | |||
add_semantic_types = set() | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
# If outputs has more columns than index, add Attribute Type to all remaining | |||
if outputs_length > len(input_indices): | |||
for column_index in range(len(input_indices), outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = set() | |||
semantic_types.add(hyperparams["return_semantic_type"]) | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = list(semantic_types) | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKFastICA.__doc__ = FastICA.__doc__ |
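# A minimal usage sketch (not from the original file), assuming this module (which
# defines SKFastICA and its Hyperparams) is importable and d3m is installed. With
# use_semantic_types at its default of False, the primitive transforms all columns,
# so no metadata annotation of the inputs is required.
import numpy
from d3m.container import DataFrame as d3m_dataframe

X = d3m_dataframe(numpy.random.RandomState(0).normal(size=(100, 3)),
                  generate_metadata=True)
fastica = SKFastICA(hyperparams=Hyperparams.defaults())
fastica.set_training_data(inputs=X)
fastica.fit()
components = fastica.produce(inputs=X).value  # DataFrame of independent components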
@@ -1,361 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.cluster.hierarchical import FeatureAgglomeration | |||
from numpy import mean as npmean | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
labels_: Optional[ndarray] | |||
n_leaves_: Optional[int] | |||
children_: Optional[ndarray] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
n_clusters = hyperparams.Bounded[int]( | |||
default=2, | |||
lower=0, | |||
upper=None, | |||
description='The number of clusters to find.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
affinity = hyperparams.Enumeration[str]( | |||
default='euclidean', | |||
values=['euclidean', 'l1', 'l2', 'manhattan', 'cosine', 'precomputed'], | |||
description='Metric used to compute the linkage. Can be "euclidean", "l1", "l2", "manhattan", "cosine", or \'precomputed\'. If linkage is "ward", only "euclidean" is accepted.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
compute_full_tree = hyperparams.Union( | |||
configuration=OrderedDict({ | |||
'auto': hyperparams.Constant( | |||
default='auto', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
), | |||
'bool': hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'], | |||
) | |||
}), | |||
default='auto', | |||
description='Stop early the construction of the tree at n_clusters. This is useful to decrease computation time if the number of clusters is not small compared to the number of features. This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
linkage = hyperparams.Enumeration[str]( | |||
default='ward', | |||
values=['ward', 'complete', 'average', 'single'], | |||
description='Which linkage criterion to use. The linkage criterion determines which distance to use between sets of features. The algorithm will merge the pairs of cluster that minimize this criterion. - ward minimizes the variance of the clusters being merged. - average uses the average of the distances of each feature of the two sets. - complete or maximum linkage uses the maximum distances between all features of the two sets.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to operate on. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not operate on. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'], | |||
default='https://metadata.datadrivendiscovery.org/types/Attribute', | |||
description='Decides what semantic type to attach to generated attributes', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKFeatureAgglomeration(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn FeatureAgglomeration | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.DATA_STREAM_CLUSTERING, ], | |||
"name": "sklearn.cluster.hierarchical.FeatureAgglomeration", | |||
"primitive_family": metadata_base.PrimitiveFamily.DATA_PREPROCESSING, | |||
"python_path": "d3m.primitives.data_preprocessing.feature_agglomeration.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html']}, | |||
"version": "2019.11.13", | |||
"id": "f259b009-5e0f-37b1-b117-441aba2b65c8", | |||
"hyperparams_to_tune": ['n_clusters', 'affinity', 'linkage'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = FeatureAgglomeration( | |||
n_clusters=self.hyperparams['n_clusters'], | |||
affinity=self.hyperparams['affinity'], | |||
compute_full_tree=self.hyperparams['compute_full_tree'], | |||
linkage=self.hyperparams['linkage'], | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
def set_training_data(self, *, inputs: Inputs) -> None: | |||
self._inputs = inputs | |||
self._fitted = False | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._fitted: | |||
return CallResult(None) | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns
if len(self._training_indices) > 0: | |||
self._clf.fit(self._training_inputs) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
sk_inputs = inputs | |||
if self.hyperparams['use_semantic_types']: | |||
sk_inputs = inputs.iloc[:, self._training_indices] | |||
output_columns = [] | |||
if len(self._training_indices) > 0: | |||
sk_output = self._clf.transform(sk_inputs) | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
outputs = self._wrap_predictions(inputs, sk_output) | |||
if len(outputs.columns) == len(self._input_column_names): | |||
outputs.columns = self._input_column_names | |||
output_columns = [outputs] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._training_indices, | |||
columns_list=output_columns) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
labels_=None, | |||
n_leaves_=None, | |||
children_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
labels_=getattr(self._clf, 'labels_', None), | |||
n_leaves_=getattr(self._clf, 'n_leaves_', None), | |||
children_=getattr(self._clf, 'children_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.labels_ = params['labels_'] | |||
self._clf.n_leaves_ = params['n_leaves_'] | |||
self._clf.children_ = params['children_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['labels_'] is not None: | |||
self._fitted = True | |||
if params['n_leaves_'] is not None: | |||
self._fitted = True | |||
if params['children_'] is not None: | |||
self._fitted = True | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_columns'], | |||
exclude_columns=hyperparams['exclude_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
            semantic_types_to_remove = set()
            add_semantic_types = set()
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=True) | |||
target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_name = "output_{}".format(column_index) | |||
column_metadata = OrderedDict() | |||
semantic_types = set() | |||
semantic_types.add(hyperparams["return_semantic_type"]) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKFeatureAgglomeration.__doc__ = FeatureAgglomeration.__doc__ |
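# A minimal usage sketch (not from the original file): pool five features into two
# cluster-mean features. Hyperparams.defaults().replace comes from d3m and is assumed
# available; n_clusters=2 overrides the default of 2 clusters being found over columns.
import numpy
from d3m.container import DataFrame as d3m_dataframe

rng = numpy.random.RandomState(0)
X = d3m_dataframe(rng.normal(size=(60, 5)), generate_metadata=True)
agglo = SKFeatureAgglomeration(hyperparams=Hyperparams.defaults().replace({'n_clusters': 2}))
agglo.set_training_data(inputs=X)
agglo.fit()
pooled = agglo.produce(inputs=X).value  # 60 x 2 DataFrame of agglomerated features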
@@ -1,492 +0,0 @@ | |||
from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple | |||
from numpy import ndarray | |||
from collections import OrderedDict | |||
from scipy import sparse | |||
import os | |||
import sklearn | |||
import numpy | |||
import typing | |||
# Custom import commands if any | |||
from sklearn.naive_bayes import GaussianNB | |||
from d3m.container.numpy import ndarray as d3m_ndarray | |||
from d3m.container import DataFrame as d3m_dataframe | |||
from d3m.metadata import hyperparams, params, base as metadata_base | |||
from d3m import utils | |||
from d3m.base import utils as base_utils | |||
from d3m.exceptions import PrimitiveNotFittedError | |||
from d3m.primitive_interfaces.base import CallResult, DockerContainer | |||
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase | |||
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin | |||
from d3m import exceptions | |||
import pandas | |||
Inputs = d3m_dataframe | |||
Outputs = d3m_dataframe | |||
class Params(params.Params): | |||
class_prior_: Optional[ndarray] | |||
class_count_: Optional[ndarray] | |||
theta_: Optional[ndarray] | |||
sigma_: Optional[ndarray] | |||
classes_: Optional[ndarray] | |||
epsilon_: Optional[float] | |||
input_column_names: Optional[Any] | |||
target_names_: Optional[Sequence[Any]] | |||
training_indices_: Optional[Sequence[int]] | |||
target_column_indices_: Optional[Sequence[int]] | |||
target_columns_metadata_: Optional[List[OrderedDict]] | |||
class Hyperparams(hyperparams.Hyperparams): | |||
var_smoothing = hyperparams.Bounded[float]( | |||
lower=0, | |||
upper=None, | |||
default=1e-09, | |||
description='Portion of the largest variance of all features that is added to variances for calculation stability.', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'] | |||
) | |||
use_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training input. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
use_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to force primitive to use as training target. If any specified column cannot be parsed, it is skipped.", | |||
) | |||
exclude_inputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training inputs. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
exclude_outputs_columns = hyperparams.Set( | |||
elements=hyperparams.Hyperparameter[int](-1), | |||
default=(), | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="A set of column indices to not use as training target. Applicable only if \"use_columns\" is not provided.", | |||
) | |||
return_result = hyperparams.Enumeration( | |||
values=['append', 'replace', 'new'], | |||
default='new', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Should parsed columns be appended, should they replace original columns, or should only parsed columns be returned? This hyperparam is ignored if use_semantic_types is set to false.", | |||
) | |||
use_semantic_types = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Controls whether semantic_types metadata will be used for filtering columns in input dataframe. Setting this to false makes the code ignore return_result and will produce only the output dataframe" | |||
) | |||
add_index_columns = hyperparams.UniformBool( | |||
default=False, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Also include primary index columns if input data has them. Applicable only if \"return_result\" is set to \"new\".", | |||
) | |||
error_on_no_input = hyperparams.UniformBool( | |||
default=True, | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'], | |||
description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking set this to False.", | |||
) | |||
return_semantic_type = hyperparams.Enumeration[str]( | |||
values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'], | |||
default='https://metadata.datadrivendiscovery.org/types/PredictedTarget', | |||
description='Decides what semantic type to attach to generated output', | |||
semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'] | |||
) | |||
class SKGaussianNB(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams], | |||
ProbabilisticCompositionalityMixin[Inputs, Outputs, Params, Hyperparams], | |||
ContinueFitMixin[Inputs, Outputs, Params, Hyperparams]): | |||
""" | |||
Primitive wrapping for sklearn GaussianNB | |||
`sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html>`_ | |||
""" | |||
__author__ = "JPL MARVIN" | |||
metadata = metadata_base.PrimitiveMetadata({ | |||
"algorithm_types": [metadata_base.PrimitiveAlgorithmType.NAIVE_BAYES_CLASSIFIER, ], | |||
"name": "sklearn.naive_bayes.GaussianNB", | |||
"primitive_family": metadata_base.PrimitiveFamily.CLASSIFICATION, | |||
"python_path": "d3m.primitives.classification.gaussian_naive_bayes.SKlearn", | |||
"source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html']}, | |||
"version": "2019.11.13", | |||
"id": "464783a8-771e-340d-999b-ae90b9f84f0b", | |||
"hyperparams_to_tune": ['var_smoothing'], | |||
'installation': [ | |||
{'type': metadata_base.PrimitiveInstallationType.PIP, | |||
'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format( | |||
git_commit=utils.current_git_commit(os.path.dirname(__file__)), | |||
), | |||
}] | |||
}) | |||
def __init__(self, *, | |||
hyperparams: Hyperparams, | |||
random_seed: int = 0, | |||
docker_containers: Dict[str, DockerContainer] = None, | |||
_priors: Union[ndarray, None] = None) -> None: | |||
super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) | |||
self._clf = GaussianNB( | |||
var_smoothing=self.hyperparams['var_smoothing'], | |||
priors=_priors | |||
) | |||
self._inputs = None | |||
self._outputs = None | |||
self._training_inputs = None | |||
self._training_outputs = None | |||
self._target_names = None | |||
self._training_indices = None | |||
self._target_column_indices = None | |||
self._target_columns_metadata: List[OrderedDict] = None | |||
self._input_column_names = None | |||
self._fitted = False | |||
self._new_training_data = False | |||
def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: | |||
self._inputs = inputs | |||
self._outputs = outputs | |||
self._fitted = False | |||
self._new_training_data = True | |||
def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._inputs is None or self._outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams) | |||
self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams) | |||
self._input_column_names = self._training_inputs.columns | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
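            # sklearn expects a 1-D y for single-target problems, so a (n, 1) target
            # column is ravelled to avoid a DataConversionWarning.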
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
return CallResult(None) | |||
def continue_fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]: | |||
if self._training_inputs is None or self._training_outputs is None: | |||
raise ValueError("Missing training data.") | |||
if not self._new_training_data: | |||
return CallResult(None) | |||
self._new_training_data = False | |||
if len(self._training_indices) > 0 and len(self._target_column_indices) > 0: | |||
self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams) | |||
sk_training_output = self._training_outputs.values | |||
shape = sk_training_output.shape | |||
if len(shape) == 2 and shape[1] == 1: | |||
sk_training_output = numpy.ravel(sk_training_output) | |||
self._clf.partial_fit(self._training_inputs, sk_training_output) | |||
self._fitted = True | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
return CallResult(None) | |||
def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]: | |||
sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams) | |||
output = [] | |||
if len(sk_inputs.columns): | |||
try: | |||
sk_output = self._clf.predict(sk_inputs) | |||
except sklearn.exceptions.NotFittedError as error: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") from error | |||
# For primitives that allow predicting without fitting like GaussianProcessRegressor | |||
if not self._fitted: | |||
raise PrimitiveNotFittedError("Primitive not fitted.") | |||
if sparse.issparse(sk_output): | |||
sk_output = sk_output.toarray() | |||
output = self._wrap_predictions(inputs, sk_output) | |||
output.columns = self._target_names | |||
output = [output] | |||
else: | |||
if self.hyperparams['error_on_no_input']: | |||
raise RuntimeError("No input columns were selected") | |||
self.logger.warn("No input columns were selected") | |||
outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'], | |||
add_index_columns=self.hyperparams['add_index_columns'], | |||
inputs=inputs, column_indices=self._target_column_indices, | |||
columns_list=output) | |||
return CallResult(outputs) | |||
def get_params(self) -> Params: | |||
if not self._fitted: | |||
return Params( | |||
class_prior_=None, | |||
class_count_=None, | |||
theta_=None, | |||
sigma_=None, | |||
classes_=None, | |||
epsilon_=None, | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
return Params( | |||
class_prior_=getattr(self._clf, 'class_prior_', None), | |||
class_count_=getattr(self._clf, 'class_count_', None), | |||
theta_=getattr(self._clf, 'theta_', None), | |||
sigma_=getattr(self._clf, 'sigma_', None), | |||
classes_=getattr(self._clf, 'classes_', None), | |||
epsilon_=getattr(self._clf, 'epsilon_', None), | |||
input_column_names=self._input_column_names, | |||
training_indices_=self._training_indices, | |||
target_names_=self._target_names, | |||
target_column_indices_=self._target_column_indices, | |||
target_columns_metadata_=self._target_columns_metadata | |||
) | |||
def set_params(self, *, params: Params) -> None: | |||
self._clf.class_prior_ = params['class_prior_'] | |||
self._clf.class_count_ = params['class_count_'] | |||
self._clf.theta_ = params['theta_'] | |||
self._clf.sigma_ = params['sigma_'] | |||
self._clf.classes_ = params['classes_'] | |||
self._clf.epsilon_ = params['epsilon_'] | |||
self._input_column_names = params['input_column_names'] | |||
self._training_indices = params['training_indices_'] | |||
self._target_names = params['target_names_'] | |||
self._target_column_indices = params['target_column_indices_'] | |||
self._target_columns_metadata = params['target_columns_metadata_'] | |||
if params['class_prior_'] is not None: | |||
self._fitted = True | |||
if params['class_count_'] is not None: | |||
self._fitted = True | |||
if params['theta_'] is not None: | |||
self._fitted = True | |||
if params['sigma_'] is not None: | |||
self._fitted = True | |||
if params['classes_'] is not None: | |||
self._fitted = True | |||
if params['epsilon_'] is not None: | |||
self._fitted = True | |||
def log_likelihoods(self, *, | |||
outputs: Outputs, | |||
inputs: Inputs, | |||
timeout: float = None, | |||
iterations: int = None) -> CallResult[Sequence[float]]: | |||
        inputs = inputs.iloc[:, self._training_indices]  # Select the columns used for training
outputs = outputs.iloc[:, self._target_column_indices] | |||
if len(inputs.columns) and len(outputs.columns): | |||
            # GaussianNB is a single-output learner and does not expose n_outputs_, so default to 1.
            n_outputs = getattr(self._clf, 'n_outputs_', 1)
            if outputs.shape[1] != n_outputs:
                raise exceptions.InvalidArgumentValueError("\"outputs\" argument does not have the correct number of target columns.")
log_proba = self._clf.predict_log_proba(inputs) | |||
# Making it always a list, even when only one target. | |||
            if n_outputs == 1:
log_proba = [log_proba] | |||
classes = [self._clf.classes_] | |||
else: | |||
classes = self._clf.classes_ | |||
samples_length = inputs.shape[0] | |||
log_likelihoods = [] | |||
            for k in range(n_outputs):
# We have to map each class to its internal (numerical) index used in the learner. | |||
# This allows "outputs" to contain string classes. | |||
outputs_column = outputs.iloc[:, k] | |||
classes_map = pandas.Series(numpy.arange(len(classes[k])), index=classes[k]) | |||
mapped_outputs_column = outputs_column.map(classes_map) | |||
# For each target column (column in "outputs"), for each sample (row) we pick the log | |||
# likelihood for a given class. | |||
log_likelihoods.append(log_proba[k][numpy.arange(samples_length), mapped_outputs_column]) | |||
results = d3m_dataframe(dict(enumerate(log_likelihoods)), generate_metadata=True) | |||
results.columns = outputs.columns | |||
            for k in range(n_outputs):
column_metadata = outputs.metadata.query_column(k) | |||
if 'name' in column_metadata: | |||
results.metadata = results.metadata.update_column(k, {'name': column_metadata['name']}) | |||
else: | |||
results = d3m_dataframe(generate_metadata=True) | |||
return CallResult(results) | |||
@classmethod | |||
def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return inputs, list(range(len(inputs.columns))) | |||
inputs_metadata = inputs.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
return cls._can_produce_column(inputs_metadata, column_index, hyperparams) | |||
columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(inputs_metadata, | |||
use_columns=hyperparams['use_inputs_columns'], | |||
exclude_columns=hyperparams['exclude_inputs_columns'], | |||
can_use_column=can_produce_column) | |||
return inputs.iloc[:, columns_to_produce], columns_to_produce | |||
@classmethod | |||
def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool: | |||
column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
accepted_structural_types = (int, float, numpy.integer, numpy.float64) | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute") | |||
if not issubclass(column_metadata['structural_type'], accepted_structural_types): | |||
return False | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
@classmethod | |||
def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams): | |||
if not hyperparams['use_semantic_types']: | |||
return data, list(data.columns), list(range(len(data.columns))) | |||
metadata = data.metadata | |||
def can_produce_column(column_index: int) -> bool: | |||
accepted_semantic_types = set() | |||
accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget") | |||
column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index)) | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
if len(semantic_types) == 0: | |||
cls.logger.warning("No semantic types found in column metadata") | |||
return False | |||
# Making sure all accepted_semantic_types are available in semantic_types | |||
if len(accepted_semantic_types - semantic_types) == 0: | |||
return True | |||
return False | |||
        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column)
targets = [] | |||
if target_column_indices: | |||
targets = data.select_columns(target_column_indices) | |||
target_column_names = [] | |||
for idx in target_column_indices: | |||
target_column_names.append(data.columns[idx]) | |||
return targets, target_column_names, target_column_indices | |||
@classmethod | |||
def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]: | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict(outputs_metadata.query_column(column_index)) | |||
# Update semantic types and prepare it for predicted targets. | |||
semantic_types = set(column_metadata.get('semantic_types', [])) | |||
semantic_types_to_remove = set(["https://metadata.datadrivendiscovery.org/types/TrueTarget","https://metadata.datadrivendiscovery.org/types/SuggestedTarget",]) | |||
add_semantic_types = set(["https://metadata.datadrivendiscovery.org/types/PredictedTarget",]) | |||
add_semantic_types.add(hyperparams["return_semantic_type"]) | |||
semantic_types = semantic_types - semantic_types_to_remove | |||
semantic_types = semantic_types.union(add_semantic_types) | |||
column_metadata['semantic_types'] = list(semantic_types) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
@classmethod | |||
def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs], | |||
target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata: | |||
outputs_metadata = metadata_base.DataMetadata().generate(value=outputs) | |||
for column_index, column_metadata in enumerate(target_columns_metadata): | |||
column_metadata.pop("structural_type", None) | |||
outputs_metadata = outputs_metadata.update_column(column_index, column_metadata) | |||
return outputs_metadata | |||
def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs: | |||
outputs = d3m_dataframe(predictions, generate_metadata=False) | |||
outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata) | |||
return outputs | |||
@classmethod | |||
def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata): | |||
outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length'] | |||
target_columns_metadata: List[OrderedDict] = [] | |||
for column_index in range(outputs_length): | |||
column_metadata = OrderedDict() | |||
semantic_types = [] | |||
semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget') | |||
column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name") | |||
if column_name is None: | |||
column_name = "output_{}".format(column_index) | |||
column_metadata["semantic_types"] = semantic_types | |||
column_metadata["name"] = str(column_name) | |||
target_columns_metadata.append(column_metadata) | |||
return target_columns_metadata | |||
SKGaussianNB.__doc__ = GaussianNB.__doc__ |
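
# A minimal, self-contained sketch (added for illustration; not part of the
# original file) of the semantic-type rewrite that _get_target_columns_metadata
# performs above: TrueTarget/SuggestedTarget are stripped and PredictedTarget
# (plus the configured return_semantic_type) is added.
_demo_types = {
    "https://metadata.datadrivendiscovery.org/types/TrueTarget",
    "https://metadata.datadrivendiscovery.org/types/Target",
}
_demo_types -= {
    "https://metadata.datadrivendiscovery.org/types/TrueTarget",
    "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
}
_demo_types |= {"https://metadata.datadrivendiscovery.org/types/PredictedTarget"}
assert "https://metadata.datadrivendiscovery.org/types/PredictedTarget" in _demo_types
assert "https://metadata.datadrivendiscovery.org/types/TrueTarget" not in _demo_types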


# ===========================================================================
# SKGaussianProcessRegressor: d3m wrapper for
# sklearn.gaussian_process.GaussianProcessRegressor
# ===========================================================================

from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
# GaussianProcessRegressor is imported from the public sklearn.gaussian_process
# package; the private sklearn.gaussian_process.gpr module is deprecated and
# absent from newer scikit-learn releases.
from sklearn.gaussian_process import GaussianProcessRegressor

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.supervised_learning import SupervisedLearnerPrimitiveBase
from d3m.primitive_interfaces.base import ProbabilisticCompositionalityMixin, ContinueFitMixin
from d3m import exceptions
import pandas

Inputs = d3m_dataframe
Outputs = d3m_dataframe

class Params(params.Params):
    X_train_: Optional[ndarray]
    y_train_: Optional[ndarray]
    kernel_: Optional[Callable]
    alpha_: Optional[ndarray]
    log_marginal_likelihood_value_: Optional[float]
    _y_train_mean: Optional[ndarray]
    _rng: Optional[numpy.random.mtrand.RandomState]
    L_: Optional[ndarray]
    _K_inv: Optional[object]
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]

class Hyperparams(hyperparams.Hyperparams):
    alpha = hyperparams.Union(
        configuration=OrderedDict({
            'float': hyperparams.Hyperparameter[float](
                default=1e-10,
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            ),
            'ndarray': hyperparams.Hyperparameter[ndarray](
                default=numpy.array([]),
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )
        }),
        default='float',
        description='Value added to the diagonal of the kernel matrix during fitting. Larger values correspond to an increased noise level in the observations and reduce potential numerical issues during fitting. If an array is passed, it must have the same number of entries as the data used for fitting and is used as a datapoint-dependent noise level. Note that this is equivalent to adding a WhiteKernel with c=alpha. Being able to specify the noise level directly as a parameter is mainly for convenience and for consistency with Ridge.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    optimizer = hyperparams.Constant(
        default='fmin_l_bfgs_b',
        description='Can either be one of the internally supported optimizers for optimizing the kernel\'s parameters, specified by a string, or an externally defined optimizer passed as a callable. If a callable is passed, it must have the signature ``def optimizer(obj_func, initial_theta, bounds)``, where \'obj_func\' is the objective function to be maximized (taking the hyperparameters theta as a parameter and an optional flag eval_gradient which determines whether the gradient is returned in addition to the function value), \'initial_theta\' is the initial value for theta (usable by local optimizers) and \'bounds\' are the bounds on the values of theta. It returns the best found hyperparameters theta and the corresponding value of the target function as ``theta_opt, func_min``. By default, the \'fmin_l_bfgs_b\' algorithm from scipy.optimize is used. If None is passed, the kernel\'s parameters are kept fixed. Available internal optimizers are: \'fmin_l_bfgs_b\'.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    n_restarts_optimizer = hyperparams.Bounded[int](
        default=0,
        lower=0,
        upper=None,
        description='The number of restarts of the optimizer for finding the kernel\'s parameters which maximize the log-marginal likelihood. The first run of the optimizer is performed from the kernel\'s initial parameters, the remaining ones (if any) from thetas sampled log-uniformly at random from the space of allowed theta-values. If greater than 0, all bounds must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    normalize_y = hyperparams.UniformBool(
        default=False,
        description='Whether the target values y are normalized, i.e., the mean of the observed target values becomes zero. This parameter should be set to True if the target values\' mean is expected to differ considerably from zero. When enabled, the normalization effectively modifies the GP\'s prior based on the data, which contradicts the likelihood principle; normalization is thus disabled by default.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force the primitive to use as training inputs. If any specified column cannot be parsed, it is skipped.",
    )
    use_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force the primitive to use as training targets. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_inputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices not to use as training inputs. Applicable only if \"use_inputs_columns\" is not provided.",
    )
    exclude_outputs_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices not to use as training targets. Applicable only if \"use_outputs_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should resulting columns be appended, should they replace the original columns, or should only the resulting columns be returned? This hyper-parameter is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if the input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute', 'https://metadata.datadrivendiscovery.org/types/PredictedTarget'],
        default='https://metadata.datadrivendiscovery.org/types/PredictedTarget',
        description='Decides what semantic type to attach to the generated output.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
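
# A hedged usage sketch (added for illustration; not part of the original
# file): d3m hyper-parameters are typically built from the defaults and
# selectively overridden, e.g.:
#
#     hp = Hyperparams.defaults().replace({'alpha': 1e-8, 'normalize_y': True})
#
# Hyperparams.defaults() and .replace() are standard d3m API; the values shown
# here are arbitrary.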
class SKGaussianProcessRegressor(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn GaussianProcessRegressor
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html>`_
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.GAUSSIAN_PROCESS, ],
        "name": "sklearn.gaussian_process.gpr.GaussianProcessRegressor",
        "primitive_family": metadata_base.PrimitiveFamily.REGRESSION,
        "python_path": "d3m.primitives.regression.gaussian_process.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html']},
        "version": "2019.11.13",
        "id": "3894e630-d67b-35d9-ab78-233e264f6324",
        "hyperparams_to_tune": ['alpha'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
             }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = GaussianProcessRegressor(
            alpha=self.hyperparams['alpha'],
            optimizer=self.hyperparams['optimizer'],
            n_restarts_optimizer=self.hyperparams['n_restarts_optimizer'],
            normalize_y=self.hyperparams['normalize_y'],
            random_state=self.random_seed,
        )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
        self._new_training_data = False
    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        self._inputs = inputs
        self._outputs = outputs
        self._fitted = False
        self._new_training_data = True

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._inputs is None or self._outputs is None:
            raise ValueError("Missing training data.")

        if not self._new_training_data:
            return CallResult(None)
        self._new_training_data = False

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        self._training_outputs, self._target_names, self._target_column_indices = self._get_targets(self._outputs, self.hyperparams)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0 and len(self._target_column_indices) > 0:
            self._target_columns_metadata = self._get_target_columns_metadata(self._training_outputs.metadata, self.hyperparams)
            sk_training_output = self._training_outputs.values

            shape = sk_training_output.shape
            if len(shape) == 2 and shape[1] == 1:
                sk_training_output = numpy.ravel(sk_training_output)

            self._clf.fit(self._training_inputs, sk_training_output)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)

    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        sk_inputs, columns_to_use = self._get_columns_to_fit(inputs, self.hyperparams)
        output = []
        if len(sk_inputs.columns):
            try:
                sk_output = self._clf.predict(sk_inputs)
            except sklearn.exceptions.NotFittedError as error:
                raise PrimitiveNotFittedError("Primitive not fitted.") from error
            # Explicit check for primitives that allow predicting without
            # fitting, like GaussianProcessRegressor (which falls back to the
            # GP prior and therefore does not raise NotFittedError itself).
            if not self._fitted:
                raise PrimitiveNotFittedError("Primitive not fitted.")
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            output = self._wrap_predictions(inputs, sk_output)
            output.columns = self._target_names
            output = [output]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")
        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._target_column_indices,
                                             columns_list=output)
        return CallResult(outputs)
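
    # Note: with return_result='new' (the default) combine_columns returns only
    # the prediction column(s); 'append' attaches them to the input frame and
    # 'replace' overwrites the columns at self._target_column_indices.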
    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                X_train_=None,
                y_train_=None,
                kernel_=None,
                alpha_=None,
                log_marginal_likelihood_value_=None,
                _y_train_mean=None,
                _rng=None,
                L_=None,
                _K_inv=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            X_train_=getattr(self._clf, 'X_train_', None),
            y_train_=getattr(self._clf, 'y_train_', None),
            kernel_=getattr(self._clf, 'kernel_', None),
            alpha_=getattr(self._clf, 'alpha_', None),
            log_marginal_likelihood_value_=getattr(self._clf, 'log_marginal_likelihood_value_', None),
            _y_train_mean=getattr(self._clf, '_y_train_mean', None),
            _rng=getattr(self._clf, '_rng', None),
            L_=getattr(self._clf, 'L_', None),
            _K_inv=getattr(self._clf, '_K_inv', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )
    def set_params(self, *, params: Params) -> None:
        self._clf.X_train_ = params['X_train_']
        self._clf.y_train_ = params['y_train_']
        self._clf.kernel_ = params['kernel_']
        self._clf.alpha_ = params['alpha_']
        self._clf.log_marginal_likelihood_value_ = params['log_marginal_likelihood_value_']
        self._clf._y_train_mean = params['_y_train_mean']
        self._clf._rng = params['_rng']
        self._clf.L_ = params['L_']
        self._clf._K_inv = params['_K_inv']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        # The primitive counts as fitted as soon as any learned sklearn
        # attribute has been restored.
        if any(params[name] is not None for name in (
                'X_train_', 'y_train_', 'kernel_', 'alpha_',
                'log_marginal_likelihood_value_', '_y_train_mean',
                '_rng', 'L_', '_K_inv')):
            self._fitted = True
    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_inputs_columns'],
            exclude_columns=hyperparams['exclude_inputs_columns'],
            can_use_column=can_produce_column,
        )
        return inputs.iloc[:, columns_to_produce], columns_to_produce
    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types.
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
    @classmethod
    def _get_targets(cls, data: d3m_dataframe, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return data, list(data.columns), list(range(len(data.columns)))

        metadata = data.metadata

        def can_produce_column(column_index: int) -> bool:
            accepted_semantic_types = set()
            accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/TrueTarget")
            column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, column_index))
            semantic_types = set(column_metadata.get('semantic_types', []))
            if len(semantic_types) == 0:
                cls.logger.warning("No semantic types found in column metadata")
                return False
            # Making sure all accepted_semantic_types are available in semantic_types.
            if len(accepted_semantic_types - semantic_types) == 0:
                return True
            return False

        target_column_indices, target_columns_not_to_produce = base_utils.get_columns_to_use(
            metadata,
            use_columns=hyperparams['use_outputs_columns'],
            exclude_columns=hyperparams['exclude_outputs_columns'],
            can_use_column=can_produce_column,
        )

        targets = []
        if target_column_indices:
            targets = data.select_columns(target_column_indices)
        target_column_names = []
        for idx in target_column_indices:
            target_column_names.append(data.columns[idx])
        return targets, target_column_names, target_column_indices
    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set([
                "https://metadata.datadrivendiscovery.org/types/TrueTarget",
                "https://metadata.datadrivendiscovery.org/types/SuggestedTarget",
            ])
            add_semantic_types = set([
                "https://metadata.datadrivendiscovery.org/types/PredictedTarget",
            ])
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata

    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=False)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, self._target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict()
            semantic_types = []
            semantic_types.append('https://metadata.datadrivendiscovery.org/types/PredictedTarget')
            column_name = outputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index)).get("name")
            if column_name is None:
                column_name = "output_{}".format(column_index)
            column_metadata["semantic_types"] = semantic_types
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


SKGaussianProcessRegressor.__doc__ = GaussianProcessRegressor.__doc__
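
if __name__ == '__main__':
    # A minimal smoke test (added for illustration; not part of the original
    # file). It assumes a working d3m + scikit-learn installation and uses the
    # default hyper-parameters (use_semantic_types=False, so every column is
    # treated as an input/target); all values here are illustrative only.
    _X = d3m_dataframe(pandas.DataFrame({'a': [0.0, 1.0, 2.0, 3.0],
                                         'b': [1.0, 0.0, 1.0, 0.0]}),
                       generate_metadata=True)
    _y = d3m_dataframe(pandas.DataFrame({'target': [0.1, 1.1, 2.1, 3.1]}),
                       generate_metadata=True)
    _primitive = SKGaussianProcessRegressor(hyperparams=Hyperparams.defaults())
    _primitive.set_training_data(inputs=_X, outputs=_y)
    _primitive.fit()
    print(_primitive.produce(inputs=_X).value)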


# ===========================================================================
# SKGaussianRandomProjection: d3m wrapper for
# sklearn.random_projection.GaussianRandomProjection
# ===========================================================================

from typing import Any, Callable, List, Dict, Union, Optional, Sequence, Tuple
from numpy import ndarray
from collections import OrderedDict
from scipy import sparse
import os
import sklearn
import numpy
import typing

# Custom import commands if any
from sklearn.random_projection import GaussianRandomProjection

from d3m.container.numpy import ndarray as d3m_ndarray
from d3m.container import DataFrame as d3m_dataframe
from d3m.metadata import hyperparams, params, base as metadata_base
from d3m import utils
from d3m.base import utils as base_utils
from d3m.exceptions import PrimitiveNotFittedError
from d3m.primitive_interfaces.base import CallResult, DockerContainer
from d3m.primitive_interfaces.unsupervised_learning import UnsupervisedLearnerPrimitiveBase

Inputs = d3m_dataframe
Outputs = d3m_dataframe

class Params(params.Params):
    # scikit-learn stores the learned dimensionality on a fitted
    # GaussianRandomProjection as ``n_components_``.
    n_components_: Optional[int]
    components_: Optional[Union[ndarray, sparse.spmatrix]]
    input_column_names: Optional[Any]
    target_names_: Optional[Sequence[Any]]
    training_indices_: Optional[Sequence[int]]
    target_column_indices_: Optional[Sequence[int]]
    target_columns_metadata_: Optional[List[OrderedDict]]

class Hyperparams(hyperparams.Hyperparams):
    n_components = hyperparams.Union(
        configuration=OrderedDict({
            'int': hyperparams.Bounded[int](
                lower=0,
                upper=None,
                default=100,
                description='Number of components to keep.',
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            ),
            'auto': hyperparams.Constant(
                default='auto',
                semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter'],
            )
        }),
        default='auto',
        description='Dimensionality of the target projection space. n_components can be automatically adjusted according to the number of samples in the dataset and the bound given by the Johnson-Lindenstrauss lemma. In that case the quality of the embedding is controlled by the ``eps`` parameter. It should be noted that the Johnson-Lindenstrauss lemma can yield very conservative estimates of the required number of components, as it makes no assumption on the structure of the dataset.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    eps = hyperparams.Bounded[float](
        default=0.1,
        lower=0,
        upper=1,
        description='Parameter to control the quality of the embedding according to the Johnson-Lindenstrauss lemma when n_components is set to \'auto\'. Smaller values lead to better embeddings and a higher number of dimensions (n_components) in the target projection space.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/TuningParameter']
    )
    use_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices to force the primitive to operate on. If any specified column cannot be parsed, it is skipped.",
    )
    exclude_columns = hyperparams.Set(
        elements=hyperparams.Hyperparameter[int](-1),
        default=(),
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="A set of column indices not to operate on. Applicable only if \"use_columns\" is not provided.",
    )
    return_result = hyperparams.Enumeration(
        values=['append', 'replace', 'new'],
        default='new',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Should resulting columns be appended, should they replace the original columns, or should only the resulting columns be returned? This hyper-parameter is ignored if use_semantic_types is set to false.",
    )
    use_semantic_types = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Controls whether semantic_types metadata will be used for filtering columns in the input dataframe. Setting this to false makes the code ignore return_result and produce only the output dataframe.",
    )
    add_index_columns = hyperparams.UniformBool(
        default=False,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Also include primary index columns if the input data has them. Applicable only if \"return_result\" is set to \"new\".",
    )
    error_on_no_input = hyperparams.UniformBool(
        default=True,
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter'],
        description="Throw an exception if no input column is selected/provided. Defaults to true to behave like sklearn. To prevent pipelines from breaking, set this to False.",
    )
    return_semantic_type = hyperparams.Enumeration[str](
        values=['https://metadata.datadrivendiscovery.org/types/Attribute', 'https://metadata.datadrivendiscovery.org/types/ConstructedAttribute'],
        default='https://metadata.datadrivendiscovery.org/types/Attribute',
        description='Decides what semantic type to attach to generated attributes.',
        semantic_types=['https://metadata.datadrivendiscovery.org/types/ControlParameter']
    )
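
# A hedged sketch (illustrative values; not from the original file): the Union
# hyper-parameter above accepts either the constant 'auto' or a non-negative
# int, e.g.:
#
#     hp = Hyperparams.defaults().replace({'n_components': 50, 'eps': 0.2})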
class SKGaussianRandomProjection(UnsupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]):
    """
    Primitive wrapping for sklearn GaussianRandomProjection
    `sklearn documentation <https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.GaussianRandomProjection.html>`_
    """

    __author__ = "JPL MARVIN"
    metadata = metadata_base.PrimitiveMetadata({
        "algorithm_types": [metadata_base.PrimitiveAlgorithmType.RANDOM_PROJECTION, ],
        "name": "sklearn.random_projection.GaussianRandomProjection",
        "primitive_family": metadata_base.PrimitiveFamily.DATA_TRANSFORMATION,
        "python_path": "d3m.primitives.data_transformation.gaussian_random_projection.SKlearn",
        "source": {'name': 'JPL', 'contact': 'mailto:shah@jpl.nasa.gov', 'uris': ['https://gitlab.com/datadrivendiscovery/sklearn-wrap/issues', 'https://scikit-learn.org/stable/modules/generated/sklearn.random_projection.GaussianRandomProjection.html']},
        "version": "2019.11.13",
        "id": "fc933ab9-baaf-47ca-a373-bdd33081f5fa",
        "hyperparams_to_tune": ['n_components'],
        'installation': [
            {'type': metadata_base.PrimitiveInstallationType.PIP,
             'package_uri': 'git+https://gitlab.com/datadrivendiscovery/sklearn-wrap.git@{git_commit}#egg=sklearn_wrap'.format(
                 git_commit=utils.current_git_commit(os.path.dirname(__file__)),
             ),
             }]
    })

    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:
        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)

        self._clf = GaussianRandomProjection(
            n_components=self.hyperparams['n_components'],
            eps=self.hyperparams['eps'],
            random_state=self.random_seed,
        )

        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
    def set_training_data(self, *, inputs: Inputs) -> None:
        self._inputs = inputs
        self._fitted = False

    def fit(self, *, timeout: float = None, iterations: int = None) -> CallResult[None]:
        if self._fitted:
            return CallResult(None)

        self._training_inputs, self._training_indices = self._get_columns_to_fit(self._inputs, self.hyperparams)
        # Guard against missing training data before accessing its columns.
        if self._training_inputs is None:
            return CallResult(None)
        self._input_column_names = self._training_inputs.columns

        if len(self._training_indices) > 0:
            self._clf.fit(self._training_inputs)
            self._fitted = True
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        return CallResult(None)
    def produce(self, *, inputs: Inputs, timeout: float = None, iterations: int = None) -> CallResult[Outputs]:
        if not self._fitted:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        sk_inputs = inputs
        if self.hyperparams['use_semantic_types']:
            sk_inputs = inputs.iloc[:, self._training_indices]

        output_columns = []
        if len(self._training_indices) > 0:
            sk_output = self._clf.transform(sk_inputs)
            if sparse.issparse(sk_output):
                sk_output = sk_output.toarray()
            outputs = self._wrap_predictions(inputs, sk_output)
            if len(outputs.columns) == len(self._input_column_names):
                outputs.columns = self._input_column_names
            output_columns = [outputs]
        else:
            if self.hyperparams['error_on_no_input']:
                raise RuntimeError("No input columns were selected")
            self.logger.warning("No input columns were selected")

        outputs = base_utils.combine_columns(return_result=self.hyperparams['return_result'],
                                             add_index_columns=self.hyperparams['add_index_columns'],
                                             inputs=inputs, column_indices=self._training_indices,
                                             columns_list=output_columns)
        return CallResult(outputs)
    def get_params(self) -> Params:
        if not self._fitted:
            return Params(
                n_components_=None,
                components_=None,
                input_column_names=self._input_column_names,
                training_indices_=self._training_indices,
                target_names_=self._target_names,
                target_column_indices_=self._target_column_indices,
                target_columns_metadata_=self._target_columns_metadata
            )

        return Params(
            n_components_=getattr(self._clf, 'n_components_', None),
            components_=getattr(self._clf, 'components_', None),
            input_column_names=self._input_column_names,
            training_indices_=self._training_indices,
            target_names_=self._target_names,
            target_column_indices_=self._target_column_indices,
            target_columns_metadata_=self._target_columns_metadata
        )

    def set_params(self, *, params: Params) -> None:
        self._clf.n_components_ = params['n_components_']
        self._clf.components_ = params['components_']
        self._input_column_names = params['input_column_names']
        self._training_indices = params['training_indices_']
        self._target_names = params['target_names_']
        self._target_column_indices = params['target_column_indices_']
        self._target_columns_metadata = params['target_columns_metadata_']

        # The primitive counts as fitted as soon as any learned sklearn
        # attribute has been restored.
        if params['n_components_'] is not None or params['components_'] is not None:
            self._fitted = True
    @classmethod
    def _get_columns_to_fit(cls, inputs: Inputs, hyperparams: Hyperparams):
        if not hyperparams['use_semantic_types']:
            return inputs, list(range(len(inputs.columns)))

        inputs_metadata = inputs.metadata

        def can_produce_column(column_index: int) -> bool:
            return cls._can_produce_column(inputs_metadata, column_index, hyperparams)

        columns_to_produce, columns_not_to_produce = base_utils.get_columns_to_use(
            inputs_metadata,
            use_columns=hyperparams['use_columns'],
            exclude_columns=hyperparams['exclude_columns'],
            can_use_column=can_produce_column,
        )
        return inputs.iloc[:, columns_to_produce], columns_to_produce

    @classmethod
    def _can_produce_column(cls, inputs_metadata: metadata_base.DataMetadata, column_index: int, hyperparams: Hyperparams) -> bool:
        column_metadata = inputs_metadata.query((metadata_base.ALL_ELEMENTS, column_index))

        accepted_structural_types = (int, float, numpy.integer, numpy.float64)
        accepted_semantic_types = set()
        accepted_semantic_types.add("https://metadata.datadrivendiscovery.org/types/Attribute")

        if not issubclass(column_metadata['structural_type'], accepted_structural_types):
            return False

        semantic_types = set(column_metadata.get('semantic_types', []))
        if len(semantic_types) == 0:
            cls.logger.warning("No semantic types found in column metadata")
            return False

        # Making sure all accepted_semantic_types are available in semantic_types.
        if len(accepted_semantic_types - semantic_types) == 0:
            return True

        return False
    @classmethod
    def _get_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams) -> List[OrderedDict]:
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_metadata = OrderedDict(outputs_metadata.query_column(column_index))

            # Update semantic types and prepare it for predicted targets.
            semantic_types = set(column_metadata.get('semantic_types', []))
            semantic_types_to_remove = set()
            add_semantic_types = set()
            add_semantic_types.add(hyperparams["return_semantic_type"])
            semantic_types = semantic_types - semantic_types_to_remove
            semantic_types = semantic_types.union(add_semantic_types)
            column_metadata['semantic_types'] = list(semantic_types)

            target_columns_metadata.append(column_metadata)

        return target_columns_metadata
    @classmethod
    def _update_predictions_metadata(cls, inputs_metadata: metadata_base.DataMetadata, outputs: Optional[Outputs],
                                     target_columns_metadata: List[OrderedDict]) -> metadata_base.DataMetadata:
        outputs_metadata = metadata_base.DataMetadata().generate(value=outputs)

        for column_index, column_metadata in enumerate(target_columns_metadata):
            column_metadata.pop("structural_type", None)
            outputs_metadata = outputs_metadata.update_column(column_index, column_metadata)

        return outputs_metadata

    def _wrap_predictions(self, inputs: Inputs, predictions: ndarray) -> Outputs:
        outputs = d3m_dataframe(predictions, generate_metadata=True)
        target_columns_metadata = self._add_target_columns_metadata(outputs.metadata, self.hyperparams)
        outputs.metadata = self._update_predictions_metadata(inputs.metadata, outputs, target_columns_metadata)
        return outputs

    @classmethod
    def _add_target_columns_metadata(cls, outputs_metadata: metadata_base.DataMetadata, hyperparams):
        outputs_length = outputs_metadata.query((metadata_base.ALL_ELEMENTS,))['dimension']['length']

        target_columns_metadata: List[OrderedDict] = []
        for column_index in range(outputs_length):
            column_name = "output_{}".format(column_index)
            column_metadata = OrderedDict()
            semantic_types = set()
            semantic_types.add(hyperparams["return_semantic_type"])
            column_metadata['semantic_types'] = list(semantic_types)
            column_metadata["name"] = str(column_name)
            target_columns_metadata.append(column_metadata)

        return target_columns_metadata


SKGaussianRandomProjection.__doc__ = GaussianRandomProjection.__doc__
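
if __name__ == '__main__':
    # A minimal smoke test (added for illustration; not part of the original
    # file). It assumes a working d3m + scikit-learn installation. With the
    # default n_components='auto' the Johnson-Lindenstrauss bound typically
    # demands more dimensions than this toy dataset has samples, so a small
    # explicit n_components is used; all values here are illustrative only.
    _X = d3m_dataframe(numpy.random.RandomState(0).rand(10, 8), generate_metadata=True)
    _hp = Hyperparams.defaults().replace({'n_components': 3})
    _primitive = SKGaussianRandomProjection(hyperparams=_hp)
    _primitive.set_training_data(inputs=_X)
    _primitive.fit()
    print(_primitive.produce(inputs=_X).value.shape)  # expected: (10, 3)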